<!-- blob: 9044d1e65b50a9211f2ac1d958641c7450a0c4e2 [file] [log] [blame] -->
<!DOCTYPE html>
<html>
<head>
  <!-- Page metadata, theme/vendor stylesheets and the Sphinx runtime scripts.
       Stylesheet order (cascade) and script order (later scripts use earlier ones) are significant. -->
  <meta charset="utf-8">
  <title>pyspark.pandas.namespace &#8212; PySpark 3.3.4 documentation</title>
  <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
  <link rel="stylesheet" href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <!-- Font preloads carry crossorigin so the preloaded response can be reused by the later font request. -->
  <link rel="preload" as="font" type="font/woff2" crossorigin
        href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
        href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
  <link rel="stylesheet" href="../../../_static/vendor/open-sans_all/1.44.1/index.css">
  <link rel="stylesheet" href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css">
  <link rel="stylesheet" href="../../../_static/basic.css">
  <link rel="stylesheet" href="../../../_static/pygments.css">
  <link rel="stylesheet" href="../../../_static/copybutton.css">
  <link rel="stylesheet" href="../../../_static/css/pyspark.css">
  <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js">
  <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
  <script src="../../../_static/jquery.js"></script>
  <script src="../../../_static/underscore.js"></script>
  <script src="../../../_static/doctools.js"></script>
  <script src="../../../_static/language_data.js"></script>
  <script src="../../../_static/clipboard.min.js"></script>
  <script src="../../../_static/copybutton.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" crossorigin="anonymous"></script>
  <!-- The MathJax configuration block must already be in the document when the async loader
       below starts executing, so it is declared before the loader. -->
  <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
  <script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
  <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/namespace.html">
  <link rel="search" title="Search" href="../../../search.html">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="docsearch:language" content="en">
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation">
  <!-- Top bar: brand link, a toggle button targeting #navbar-menu, and links to the top-level documentation sections. -->
  <div class="container-xl">
    <a class="navbar-brand" href="../../../index.html">
      <!-- The image is the link's only content, so its alt text names the link destination. -->
      <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home">
    </a>
    <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
      <span class="navbar-toggler-icon"></span>
    </button>
    <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
      <ul id="navbar-main-elements" class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../user_guide/index.html">User Guide</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../reference/index.html">API Reference</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../development/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a>
        </li>
      </ul>
      <!-- Second list has no entries on this page; kept as generated (NOTE(review): presumably a theme slot for extra navbar items - confirm). -->
      <ul class="navbar-nav">
      </ul>
    </div>
  </div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar">
  <!-- Left sidebar: documentation search form, then the section navigation (no entries on this page). -->
  <form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
    <!-- Decorative icon; the input carries the accessible name. -->
    <i class="icon fas fa-search" aria-hidden="true"></i>
    <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off">
  </form>
  <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
    <div class="bd-toc-item active">
      <ul class="nav bd-sidenav">
      </ul>
    </div>
  </nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
  <!-- In-page section list. #bd-toc-nav is the element named by data-target on <body>; it has no entries on this page. -->
  <nav id="bd-toc-nav" aria-label="Page sections">
    <ul class="nav section-nav flex-column">
    </ul>
  </nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.pandas.namespace</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">Wrappers around spark that correspond to common pandas functions.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Set</span><span class="p">,</span>
<span class="n">Sized</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Type</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">no_type_check</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Iterable</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">tzinfo</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">from</span> <span class="nn">io</span> <span class="kn">import</span> <span class="n">BytesIO</span>
<span class="kn">import</span> <span class="nn">json</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="p">(</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">is_datetime64_dtype</span><span class="p">,</span>
<span class="n">is_datetime64tz_dtype</span><span class="p">,</span>
<span class="n">is_list_like</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pandas.tseries.offsets</span> <span class="kn">import</span> <span class="n">DateOffset</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="kn">import</span> <span class="nn">pyarrow.parquet</span> <span class="k">as</span> <span class="nn">pq</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">ByteType</span><span class="p">,</span>
<span class="n">ShortType</span><span class="p">,</span>
<span class="n">IntegerType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">FloatType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">TimestampType</span><span class="p">,</span>
<span class="n">TimestampNTZType</span><span class="p">,</span>
<span class="n">DecimalType</span><span class="p">,</span>
<span class="n">StringType</span><span class="p">,</span>
<span class="n">DateType</span><span class="p">,</span>
<span class="n">StructType</span><span class="p">,</span>
<span class="n">DataType</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.base</span> <span class="kn">import</span> <span class="n">IndexOpsMixin</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">align_diff_frames</span><span class="p">,</span>
<span class="n">default_session</span><span class="p">,</span>
<span class="n">is_name_like_tuple</span><span class="p">,</span>
<span class="n">is_name_like_value</span><span class="p">,</span>
<span class="n">name_like_string</span><span class="p">,</span>
<span class="n">same_anchor</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">log_advice</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">_reduce_spark_multi</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">,</span>
<span class="n">DEFAULT_SERIES_NAME</span><span class="p">,</span>
<span class="n">HIDDEN_COLUMNS</span><span class="p">,</span>
<span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span><span class="p">,</span> <span class="n">DatetimeIndex</span><span class="p">,</span> <span class="n">TimedeltaIndex</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.multi</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;from_pandas&quot;</span><span class="p">,</span>
<span class="s2">&quot;range&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_csv&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_delta&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_spark_io&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_parquet&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_clipboard&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_excel&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_html&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_datetime&quot;</span><span class="p">,</span>
<span class="s2">&quot;date_range&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_timedelta&quot;</span><span class="p">,</span>
<span class="s2">&quot;timedelta_range&quot;</span><span class="p">,</span>
<span class="s2">&quot;get_dummies&quot;</span><span class="p">,</span>
<span class="s2">&quot;concat&quot;</span><span class="p">,</span>
<span class="s2">&quot;melt&quot;</span><span class="p">,</span>
<span class="s2">&quot;isna&quot;</span><span class="p">,</span>
<span class="s2">&quot;isnull&quot;</span><span class="p">,</span>
<span class="s2">&quot;notna&quot;</span><span class="p">,</span>
<span class="s2">&quot;notnull&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql_table&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql_query&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_sql&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_json&quot;</span><span class="p">,</span>
<span class="s2">&quot;merge&quot;</span><span class="p">,</span>
<span class="s2">&quot;merge_asof&quot;</span><span class="p">,</span>
<span class="s2">&quot;to_numeric&quot;</span><span class="p">,</span>
<span class="s2">&quot;broadcast&quot;</span><span class="p">,</span>
<span class="s2">&quot;read_orc&quot;</span><span class="p">,</span>
<span class="p">]</span>
<span class="k">def</span> <span class="nf">from_pandas</span><span class="p">(</span><span class="n">pobj</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">Index</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Create a pandas-on-Spark DataFrame, Series or Index from a pandas DataFrame, Series or Index.</span>
<span class="sd"> This is similar to Spark&#39;s `SparkSession.createDataFrame()` with pandas DataFrame,</span>
<span class="sd"> but this also works with pandas Series and picks the index.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> pobj : pandas.DataFrame or pandas.Series</span>
<span class="sd"> pandas DataFrame or Series to read.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> If a pandas Series is passed in, this function returns a pandas-on-Spark Series.</span>
<span class="sd"> If a pandas DataFrame is passed in, this function returns a pandas-on-Spark DataFrame.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Series</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pobj</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">):</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="n">pobj</span><span class="p">))</span><span class="o">.</span><span class="n">index</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Unknown data type: </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">pobj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="n">_range</span> <span class="o">=</span> <span class="nb">range</span> <span class="c1"># built-in range</span>
<div class="viewcode-block" id="range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.range.html#pyspark.pandas.range">[docs]</a><span class="k">def</span> <span class="nf">range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">end</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">step</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">num_partitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a DataFrame with some range of numbers.</span>
<span class="sd"> The resulting DataFrame has a single int64 column named `id`, containing elements in a range</span>
<span class="sd"> from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter</span>
<span class="sd"> (i.e. start) is specified, we treat it as the end value with the start value being 0.</span>
<span class="sd"> This is similar to the range function in SparkSession and is used primarily for testing.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : int</span>
<span class="sd"> the start value (inclusive)</span>
<span class="sd"> end : int, optional</span>
<span class="sd"> the end value (exclusive)</span>
<span class="sd"> step : int, optional, default 1</span>
<span class="sd"> the incremental step</span>
<span class="sd"> num_partitions : int, optional</span>
<span class="sd"> the number of partitions of the DataFrame</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> When the first parameter is specified, we generate a range of values up till that number.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(5)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 4</span>
<span class="sd"> When start, end, and step are specified:</span>
<span class="sd"> &gt;&gt;&gt; ps.range(start = 100, end = 200, step = 20)</span>
<span class="sd"> id</span>
<span class="sd"> 0 100</span>
<span class="sd"> 1 120</span>
<span class="sd"> 2 140</span>
<span class="sd"> 3 160</span>
<span class="sd"> 4 180</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span> <span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span> <span class="n">step</span><span class="o">=</span><span class="n">step</span><span class="p">,</span> <span class="n">numPartitions</span><span class="o">=</span><span class="n">num_partitions</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_csv"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_csv.html#pyspark.pandas.read_csv">[docs]</a><span class="k">def</span> <span class="nf">read_csv</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;infer&quot;</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">squeeze</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">mangle_dupe_cols</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">quotechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Read CSV (comma-separated) file into DataFrame or Series.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : str</span>
<span class="sd"> The path string storing the CSV file to be read.</span>
<span class="sd"> sep : str, default ‘,’</span>
<span class="sd"> Delimiter to use. Must be a single character.</span>
<span class="sd"> header : int, default ‘infer’</span>
<span class="sd"> Row number to use as the column names, and the start of the data.</span>
<span class="sd"> Default behavior is to infer the column names: if no names are passed</span>
<span class="sd"> the behavior is identical to `header=0` and column names are inferred from</span>
<span class="sd"> the first line of the file, if column names are passed explicitly then</span>
<span class="sd"> the behavior is identical to `header=None`. Explicitly pass `header=0` to be</span>
<span class="sd"> able to replace existing names</span>
<span class="sd"> names : str or array-like, optional</span>
<span class="sd"> List of column names to use. If file contains no header row, then you should</span>
<span class="sd"> explicitly pass `header=None`. Duplicates in this list will cause an error to be issued.</span>
<span class="sd"> If a string is given, it should be a DDL-formatted string in Spark SQL, which is</span>
<span class="sd"> preferred to avoid schema inference for better performance.</span>
<span class="sd"> index_col: str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> usecols : list-like or callable, optional</span>
<span class="sd"> Return a subset of the columns. If list-like, all elements must either be</span>
<span class="sd"> positional (i.e. integer indices into the document columns) or strings that</span>
<span class="sd"> correspond to column names provided either by the user in names or inferred</span>
<span class="sd"> from the document header row(s).</span>
<span class="sd"> If callable, the callable function will be evaluated against the column names,</span>
<span class="sd"> returning names where the callable function evaluates to `True`.</span>
<span class="sd"> squeeze : bool, default False</span>
<span class="sd"> If the parsed data only contains one column then return a Series.</span>
<span class="sd"> mangle_dupe_cols : bool, default True</span>
<span class="sd"> Duplicate columns will be specified as &#39;X0&#39;, &#39;X1&#39;, ... &#39;XN&#39;, rather</span>
<span class="sd"> than &#39;X&#39; ... &#39;X&#39;. Passing in False will cause data to be overwritten if</span>
<span class="sd"> there are duplicate names in the columns.</span>
<span class="sd"> Currently only `True` is allowed.</span>
<span class="sd"> dtype : Type name or dict of column -&gt; type, default None</span>
<span class="sd"> Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} Use str or object</span>
<span class="sd"> together with suitable na_values settings to preserve and not interpret dtype.</span>
<span class="sd"> nrows : int, default None</span>
<span class="sd"> Number of rows to read from the CSV file.</span>
<span class="sd"> parse_dates : boolean or list of ints or names or list of lists or dict, default `False`.</span>
<span class="sd"> Currently only `False` is allowed.</span>
<span class="sd"> quotechar : str (length 1), optional</span>
<span class="sd"> The character used to denote the start and end of a quoted item. Quoted items can include</span>
<span class="sd"> the delimiter and it will be ignored.</span>
<span class="sd"> escapechar : str (length 1), default None</span>
<span class="sd"> One-character string used to escape delimiter</span>
<span class="sd"> comment: str, optional</span>
<span class="sd"> Indicates the line should not be parsed.</span>
<span class="sd"> encoding: str, optional</span>
<span class="sd"> Indicates the encoding to read file</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(&#39;data.csv&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># latin-1 is the same encoding as iso-8859-1, which is why it is mapped to iso-8859-1.</span>
<span class="n">encoding_mapping</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;latin-1&quot;</span><span class="p">:</span> <span class="s2">&quot;iso-8859-1&quot;</span><span class="p">}</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">mangle_dupe_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">True</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;mangle_dupe_cols can only be `True`: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">mangle_dupe_cols</span><span class="p">)</span>
<span class="k">if</span> <span class="n">parse_dates</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;parse_dates can only be `False`: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">parse_dates</span><span class="p">)</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span>
<span class="n">usecols</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">usecols</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;inferSchema&quot;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;sep&quot;</span><span class="p">,</span> <span class="n">sep</span><span class="p">)</span>
<span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="s2">&quot;infer&quot;</span><span class="p">:</span>
<span class="n">header</span> <span class="o">=</span> <span class="mi">0</span> <span class="k">if</span> <span class="n">names</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">header</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;header&quot;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;header&quot;</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Unknown header argument </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">header</span><span class="p">))</span>
<span class="k">if</span> <span class="n">quotechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;quote&quot;</span><span class="p">,</span> <span class="n">quotechar</span><span class="p">)</span>
<span class="k">if</span> <span class="n">escapechar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;escape&quot;</span><span class="p">,</span> <span class="n">escapechar</span><span class="p">)</span>
<span class="k">if</span> <span class="n">comment</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">comment</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">len</span><span class="p">(</span><span class="n">comment</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Only length-1 comment characters supported&quot;</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;comment&quot;</span><span class="p">,</span> <span class="n">comment</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">encoding</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;encoding&quot;</span><span class="p">,</span> <span class="n">encoding_mapping</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">encoding</span><span class="p">,</span> <span class="n">encoding</span><span class="p">))</span>
<span class="n">column_labels</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">names</span><span class="p">)</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">csv</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">names</span><span class="p">):</span>
<span class="n">names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">names</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">names</span><span class="p">))</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Found non-unique column index&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;The number of names [</span><span class="si">%s</span><span class="s2">] does not match the number &quot;</span>
<span class="s2">&quot;of columns [</span><span class="si">%d</span><span class="s2">]. Try names by a Spark SQL DDL-formatted &quot;</span>
<span class="s2">&quot;string.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">schema</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">names</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">names</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="nb">enumerate</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">}</span>
<span class="k">if</span> <span class="n">usecols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">missing</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span>
<span class="k">if</span> <span class="nb">callable</span><span class="p">(</span><span class="n">usecols</span><span class="p">):</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">usecols</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span>
<span class="n">usecols_ints</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">usecols</span><span class="p">)</span>
<span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">())</span>
<span class="k">if</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">usecols_ints</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">col</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols_ints</span>
<span class="k">if</span> <span class="p">(</span>
<span class="n">col</span> <span class="o">&gt;=</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="ow">or</span> <span class="nb">list</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)[</span><span class="n">col</span><span class="p">]</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span>
<span class="k">elif</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span><span class="p">):</span>
<span class="n">new_column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">usecols</span>
<span class="p">}</span>
<span class="n">missing</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">usecols</span> <span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">new_column_labels</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">new_column_labels</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;&#39;usecols&#39; must either be list-like of all strings, &quot;</span>
<span class="s2">&quot;all unicode, all integers or a callable.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">missing</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Usecols do not match columns, columns expected but not &quot;</span> <span class="s2">&quot;found: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">missing</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()])</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{}</span>
<span class="k">if</span> <span class="n">nrows</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">nrows</span><span class="p">)</span>
<span class="n">index_spark_column_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">)):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span>
<span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">column_labels</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">{</span>
<span class="n">label</span><span class="p">:</span> <span class="n">col</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_col</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_csv`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span>
<span class="n">label</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">label</span><span class="p">,)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="o">.</span><span class="n">values</span><span class="p">()],</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dtype</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">for</span> <span class="n">col</span><span class="p">,</span> <span class="n">tpe</span> <span class="ow">in</span> <span class="n">dtype</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">tpe</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col</span><span class="p">]</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="k">if</span> <span class="n">squeeze</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<!-- viewcode block: highlighted source listing of read_json(path, lines=True, index_col=None, **options), annotated to return DataFrame.
     The [docs] anchor links back to the read_json API reference page.
     What the listed function does, as shown in the listing itself:
       1. when index_col is None, calls log_advice with a note that the default index is attached;
       2. when the only keyword passed is "options" and its value is a dict, uses that dict as the option mapping;
       3. raises NotImplementedError when lines is falsy;
       4. returns read_spark_io(path, format="json", index_col=index_col, **options).
     The docstring examples pass lineSep through the extra options.
     NOTE(review): this markup looks generated from the Python module; presumably text fixes belong in the module source. Confirm before hand-editing.
--><div class="viewcode-block" id="read_json"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_json.html#pyspark.pandas.read_json">[docs]</a><span class="k">def</span> <span class="nf">read_json</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert a JSON string to DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> File path</span>
<span class="sd"> lines : bool, default True</span>
<span class="sd"> Read the file as a json object per line. It should be always True for now.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd"> ... columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/foo.json&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/foo.json&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/foo.json&#39; % path, num_files=1, lineSep=&#39;___&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/foo.json&#39; % path, lineSep=&#39;___&#39;</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/read_json/bar.json&#39; % path, num_files=1, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/read_json/bar.json&#39; % path, index_col=&quot;index&quot;</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> index</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span><!-- the advice call comes before the lines check, so it also runs on calls that then raise -->
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_json`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span><!-- a lone options keyword holding a dict is unwrapped and used as the option mapping -->
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span><!-- any falsy value of lines is rejected -->
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;lines=False is not implemented yet.&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;json&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span><!-- delegates to read_spark_io, forwarding index_col and the remaining options --></div>
<div class="viewcode-block" id="read_delta"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_delta.html#pyspark.pandas.read_delta">[docs]</a><span class="k">def</span> <span class="nf">read_delta</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">version</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">timestamp</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a Delta Lake table on some file system and return a DataFrame.</span>
<span class="sd"> If the Delta Lake table is already stored in the catalog (aka the metastore), use &#39;read_table&#39;.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> Path to the Delta Lake table.</span>
<span class="sd"> version : string, optional</span>
<span class="sd"> Specifies the table version (based on Delta&#39;s internal transaction version) to read from,</span>
<span class="sd"> using Delta&#39;s time travel feature. This sets Delta&#39;s &#39;versionAsOf&#39; option. Note that</span>
<span class="sd"> this parameter and `timestamp` parameter cannot be used together, otherwise it will raise a</span>
<span class="sd"> `ValueError`.</span>
<span class="sd"> timestamp : string, optional</span>
<span class="sd"> Specifies the table version (based on timestamp) to read from,</span>
<span class="sd"> using Delta&#39;s time travel feature. This must be a valid date or timestamp string in Spark,</span>
<span class="sd"> and sets Delta&#39;s &#39;timestampAsOf&#39; option. Note that this parameter and `version` parameter</span>
<span class="sd"> cannot be used together, otherwise it will raise a `ValueError`.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options</span>
<span class="sd"> Additional options that can be passed onto Delta.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_delta</span>
<span class="sd"> read_table</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> read_parquet</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_delta(&#39;%s/read_delta/foo&#39; % path,</span>
<span class="sd"> ... mode=&#39;overwrite&#39;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/foo&#39; % path, version=0) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_delta(</span>
<span class="sd"> ... &#39;%s/read_delta/bar&#39; % path, index_col=&quot;index&quot;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_delta(&#39;%s/read_delta/bar&#39; % path, index_col=&quot;index&quot;) # doctest: +SKIP</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_delta`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;version and timestamp cannot be used together.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">version</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">options</span><span class="p">[</span><span class="s2">&quot;versionAsOf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">version</span>
<span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">options</span><span class="p">[</span><span class="s2">&quot;timestampAsOf&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">timestamp</span>
<span class="k">return</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;delta&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_table.html#pyspark.pandas.read_table">[docs]</a><span class="k">def</span> <span class="nf">read_table</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read a Spark table and return a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : string</span>
<span class="sd"> Table name in Spark.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_parquet</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_table(&#39;%s.my_table&#39; % db)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_table(&#39;%s.my_table&#39; % db)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_table(&#39;%s.my_table&#39; % db, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_table(&#39;%s.my_table&#39; % db, index_col=&quot;index&quot;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_table`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_spark_io"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_spark_io.html#pyspark.pandas.read_spark_io">[docs]</a><span class="k">def</span> <span class="nf">read_spark_io</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;StructType&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Load a DataFrame from a Spark data source.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string, optional</span>
<span class="sd"> Path to the data source.</span>
<span class="sd"> format : string, optional</span>
<span class="sd"> Specifies the input data source format. Some common ones are:</span>
<span class="sd"> - &#39;delta&#39;</span>
<span class="sd"> - &#39;parquet&#39;</span>
<span class="sd"> - &#39;orc&#39;</span>
<span class="sd"> - &#39;json&#39;</span>
<span class="sd"> - &#39;csv&#39;</span>
<span class="sd"> schema : string or StructType, optional</span>
<span class="sd"> Input schema. If none, Spark tries to infer the schema automatically.</span>
<span class="sd"> The schema can either be a Spark StructType, or a DDL-formatted string like</span>
<span class="sd"> `col0 INT, col1 DOUBLE`.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_spark_io</span>
<span class="sd"> read_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_parquet</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_spark_io(&#39;%s/read_spark_io/data.parquet&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... &#39;%s/read_spark_io/data.parquet&#39; % path, format=&#39;parquet&#39;, schema=&#39;id long&#39;)</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_spark_io(&#39;%s/read_spark_io/data.json&#39; % path,</span>
<span class="sd"> ... format=&#39;json&#39;, lineSep=&#39;__&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... &#39;%s/read_spark_io/data.json&#39; % path, format=&#39;json&#39;, schema=&#39;id long&#39;, lineSep=&#39;__&#39;)</span>
<span class="sd"> id</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10, 15, num_partitions=1).to_spark_io(&#39;%s/read_spark_io/data.orc&#39; % path,</span>
<span class="sd"> ... format=&#39;orc&#39;, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_spark_io(</span>
<span class="sd"> ... path=r&#39;%s/read_spark_io/data.orc&#39; % path, format=&quot;orc&quot;, index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 10</span>
<span class="sd"> 1 11</span>
<span class="sd"> 2 12</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 14</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="n">schema</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_parquet"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_parquet.html#pyspark.pandas.read_parquet">[docs]</a><span class="k">def</span> <span class="nf">read_parquet</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">pandas_metadata</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Load a parquet object from the file path, returning a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : string</span>
<span class="sd"> File path</span>
<span class="sd"> columns : list, default=None</span>
<span class="sd"> If not None, only these columns will be read from the file.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> pandas_metadata : bool, default: False</span>
<span class="sd"> If True, try to respect the metadata if the Parquet file is written from pandas.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_parquet</span>
<span class="sd"> read_table</span>
<span class="sd"> read_delta</span>
<span class="sd"> read_spark_io</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, columns=[&#39;id&#39;])</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_parquet(&#39;%s/read_spark_io/data.parquet&#39; % path, columns=[&#39;id&#39;], index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If `index_col` is not specified for `read_parquet`, &quot;</span>
<span class="s2">&quot;the default index is attached which can cause additional overhead.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">pandas_metadata</span><span class="p">:</span>
<span class="c1"># Try to read pandas metadata</span>
<span class="nd">@pandas_udf</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span>
<span class="s2">&quot;index_col array&lt;string&gt;, index_names array&lt;string&gt;&quot;</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">read_index_metadata</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">binary</span> <span class="o">=</span> <span class="n">pser</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">metadata</span> <span class="o">=</span> <span class="n">pq</span><span class="o">.</span><span class="n">ParquetFile</span><span class="p">(</span><span class="n">pa</span><span class="o">.</span><span class="n">BufferReader</span><span class="p">(</span><span class="n">binary</span><span class="p">))</span><span class="o">.</span><span class="n">metadata</span><span class="o">.</span><span class="n">metadata</span>
<span class="k">if</span> <span class="sa">b</span><span class="s2">&quot;pandas&quot;</span> <span class="ow">in</span> <span class="n">metadata</span><span class="p">:</span>
<span class="n">pandas_metadata</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">metadata</span><span class="p">[</span><span class="sa">b</span><span class="s2">&quot;pandas&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf8&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;index_columns&quot;</span><span class="p">]):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;index_columns&quot;</span><span class="p">]:</span>
<span class="n">index_col</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">pandas_metadata</span><span class="p">[</span><span class="s2">&quot;columns&quot;</span><span class="p">]:</span>
<span class="k">if</span> <span class="n">column</span><span class="p">[</span><span class="s2">&quot;field_name&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="n">col</span><span class="p">:</span>
<span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">[</span><span class="s2">&quot;name&quot;</span><span class="p">])</span>
<span class="k">break</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;index_col&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">index_col</span><span class="p">],</span> <span class="s2">&quot;index_names&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">index_names</span><span class="p">]})</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s2">&quot;index_col&quot;</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">],</span> <span class="s2">&quot;index_names&quot;</span><span class="p">:</span> <span class="p">[</span><span class="kc">None</span><span class="p">]})</span>
<span class="n">index_col</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">default_session</span><span class="p">()</span>
<span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">read_index_metadata</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;index_metadata&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;index_metadata.*&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;parquet&quot;</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="n">options</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">new_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_columns</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([],</span> <span class="n">schema</span><span class="o">=</span><span class="n">StructType</span><span class="p">())</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">index_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">index_names</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<!-- viewcode-block for pyspark.pandas.read_clipboard.
     The "[docs]" anchor (class viewcode-back) links back to the API reference
     page entry for this function. The highlighted listing below shows the
     function forwarding `sep` and **kwargs to pd.read_clipboard and passing
     the result through from_pandas, cast to DataFrame.
     NOTE(review): this block presumably sits inside a <pre>, so line breaks
     between the spans are rendered text; this comment is closed directly
     against the opening tag to avoid adding a text node. Confirm before
     reformatting. --><div class="viewcode-block" id="read_clipboard"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_clipboard.html#pyspark.pandas.read_clipboard">[docs]</a><span class="k">def</span> <span class="nf">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="sa">r</span><span class="s2">&quot;\s+&quot;</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read text from clipboard and pass to read_csv. See read_csv for the</span>
<span class="sd"> full argument list</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sep : str, default &#39;\s+&#39;</span>
<span class="sd"> A string or regex delimiter. The default of &#39;\s+&#39; denotes</span>
<span class="sd"> one or more whitespace characters.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_clipboard : Write text out to clipboard.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> parsed : DataFrame</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">read_clipboard</span><span class="p">(</span><span class="n">sep</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)))</span></div>
<div class="viewcode-block" id="read_excel"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_excel.html#pyspark.pandas.read_excel">[docs]</a><span class="k">def</span> <span class="nf">read_excel</span><span class="p">(</span>
<span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span>
<span class="n">sheet_name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">usecols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> <span class="n">Callable</span><span class="p">[[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">squeeze</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">true_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">false_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">nrows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">verbose</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">date_parser</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">thousands</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">comment</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipfooter</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">convert_float</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">mangle_dupe_cols</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwds</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read an Excel file into a pandas-on-Spark DataFrame or Series.</span>
<span class="sd"> Support both `xls` and `xlsx` file extensions from a local filesystem or URL.</span>
<span class="sd"> Support an option to read a single sheet or a list of sheets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> io : str, file descriptor, pathlib.Path, ExcelFile or xlrd.Book</span>
<span class="sd"> The string could be a URL. The value URL must be available in Spark&#39;s DataFrameReader.</span>
<span class="sd"> .. note::</span>
<span class="sd"> If the underlying Spark is below 3.0, the parameter as a string is not supported.</span>
<span class="sd"> You can use `ps.from_pandas(pd.read_excel(...))` as a workaround.</span>
<span class="sd"> sheet_name : str, int, list, or None, default 0</span>
<span class="sd"> Strings are used for sheet names. Integers are used in zero-indexed</span>
<span class="sd"> sheet positions. Lists of strings/integers are used to request</span>
<span class="sd"> multiple sheets. Specify None to get all sheets.</span>
<span class="sd"> Available cases:</span>
<span class="sd"> * Defaults to ``0``: 1st sheet as a `DataFrame`</span>
<span class="sd"> * ``1``: 2nd sheet as a `DataFrame`</span>
<span class="sd"> * ``&quot;Sheet1&quot;``: Load sheet with name &quot;Sheet1&quot;</span>
<span class="sd"> * ``[0, 1, &quot;Sheet5&quot;]``: Load first, second and sheet named &quot;Sheet5&quot;</span>
<span class="sd"> as a dict of `DataFrame`</span>
<span class="sd"> * None: All sheets.</span>
<span class="sd"> header : int, list of int, default 0</span>
<span class="sd"> Row (0-indexed) to use for the column labels of the parsed</span>
<span class="sd"> DataFrame. If a list of integers is passed those row positions will</span>
<span class="sd"> be combined into a ``MultiIndex``. Use None if there is no header.</span>
<span class="sd"> names : array-like, default None</span>
<span class="sd"> List of column names to use. If file contains no header row,</span>
<span class="sd"> then you should explicitly pass header=None.</span>
<span class="sd"> index_col : int, list of int, default None</span>
<span class="sd"> Column (0-indexed) to use as the row labels of the DataFrame.</span>
<span class="sd"> Pass None if there is no such column. If a list is passed,</span>
<span class="sd"> those columns will be combined into a ``MultiIndex``. If a</span>
<span class="sd"> subset of data is selected with ``usecols``, index_col</span>
<span class="sd"> is based on the subset.</span>
<span class="sd"> usecols : int, str, list-like, or callable default None</span>
<span class="sd"> Return a subset of the columns.</span>
<span class="sd"> * If None, then parse all columns.</span>
<span class="sd"> * If str, then indicates comma separated list of Excel column letters</span>
<span class="sd"> and column ranges (e.g. &quot;A:E&quot; or &quot;A,C,E:F&quot;). Ranges are inclusive of</span>
<span class="sd"> both sides.</span>
<span class="sd"> * If list of int, then indicates list of column numbers to be parsed.</span>
<span class="sd"> * If list of string, then indicates list of column names to be parsed.</span>
<span class="sd"> * If callable, then evaluate each column name against it and parse the</span>
<span class="sd"> column if the callable returns ``True``.</span>
<span class="sd"> squeeze : bool, default False</span>
<span class="sd"> If the parsed data only contains one column then return a Series.</span>
<span class="sd"> dtype : Type name or dict of column -&gt; type, default None</span>
<span class="sd"> Data type for data or columns. E.g. {&#39;a&#39;: np.float64, &#39;b&#39;: np.int32}</span>
<span class="sd"> Use `object` to preserve data as stored in Excel and not interpret dtype.</span>
<span class="sd"> If converters are specified, they will be applied INSTEAD</span>
<span class="sd"> of dtype conversion.</span>
<span class="sd"> engine : str, default None</span>
<span class="sd"> If io is not a buffer or path, this must be set to identify io.</span>
<span class="sd"> Acceptable values are None or xlrd.</span>
<span class="sd"> converters : dict, default None</span>
<span class="sd"> Dict of functions for converting values in certain columns. Keys can</span>
<span class="sd"> either be integers or column labels, values are functions that take one</span>
<span class="sd"> input argument, the Excel cell content, and return the transformed</span>
<span class="sd"> content.</span>
<span class="sd"> true_values : list, default None</span>
<span class="sd"> Values to consider as True.</span>
<span class="sd"> false_values : list, default None</span>
<span class="sd"> Values to consider as False.</span>
<span class="sd"> skiprows : list-like</span>
<span class="sd"> Rows to skip at the beginning (0-indexed).</span>
<span class="sd"> nrows : int, default None</span>
<span class="sd"> Number of rows to parse.</span>
<span class="sd"> na_values : scalar, str, list-like, or dict, default None</span>
<span class="sd"> Additional strings to recognize as NA/NaN. If dict passed, specific</span>
<span class="sd"> per-column NA values. By default the following values are interpreted</span>
<span class="sd"> as NaN.</span>
<span class="sd"> keep_default_na : bool, default True</span>
<span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span>
<span class="sd"> values are overridden, otherwise they&#39;re appended to.</span>
<span class="sd"> verbose : bool, default False</span>
<span class="sd"> Indicate number of NA values placed in non-numeric columns.</span>
<span class="sd"> parse_dates : bool, list-like, or dict, default False</span>
<span class="sd"> The behavior is as follows:</span>
<span class="sd"> * bool. If True -&gt; try parsing the index.</span>
<span class="sd"> * list of int or names. e.g. If [1, 2, 3] -&gt; try parsing columns 1, 2, 3</span>
<span class="sd"> each as a separate date column.</span>
<span class="sd"> * list of lists. e.g. If [[1, 3]] -&gt; combine columns 1 and 3 and parse as</span>
<span class="sd"> a single date column.</span>
<span class="sd"> * dict, e.g. {{&#39;foo&#39; : [1, 3]}} -&gt; parse columns 1, 3 as date and call</span>
<span class="sd"> result &#39;foo&#39;</span>
<span class="sd"> If a column or index contains an unparseable date, the entire column or</span>
<span class="sd"> index will be returned unaltered as an object data type. For non-standard</span>
<span class="sd"> datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``</span>
<span class="sd"> Note: A fast-path exists for iso8601-formatted dates.</span>
<span class="sd"> date_parser : function, optional</span>
<span class="sd"> Function to use for converting a sequence of string columns to an array of</span>
<span class="sd"> datetime instances. The default uses ``dateutil.parser.parser`` to do the</span>
<span class="sd"> conversion. pandas-on-Spark will try to call `date_parser` in three different ways,</span>
<span class="sd"> advancing to the next if an exception occurs: 1) Pass one or more arrays</span>
<span class="sd"> (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the</span>
<span class="sd"> string values from the columns defined by `parse_dates` into a single array</span>
<span class="sd"> and pass that; and 3) call `date_parser` once for each row using one or</span>
<span class="sd"> more strings (corresponding to the columns defined by `parse_dates`) as</span>
<span class="sd"> arguments.</span>
<span class="sd"> thousands : str, default None</span>
<span class="sd"> Thousands separator for parsing string columns to numeric. Note that</span>
<span class="sd"> this parameter is only necessary for columns stored as TEXT in Excel,</span>
<span class="sd"> any numeric columns will automatically be parsed, regardless of display</span>
<span class="sd"> format.</span>
<span class="sd"> comment : str, default None</span>
<span class="sd"> Comments out remainder of line. Pass a character or characters to this</span>
<span class="sd"> argument to indicate comments in the input file. Any data between the</span>
<span class="sd"> comment string and the end of the current line is ignored.</span>
<span class="sd"> skipfooter : int, default 0</span>
<span class="sd"> Rows at the end to skip (0-indexed).</span>
<span class="sd"> convert_float : bool, default True</span>
<span class="sd"> Convert integral floats to int (i.e., 1.0 --&gt; 1). If False, all numeric</span>
<span class="sd"> data will be read in as floats: Excel stores all numbers as floats</span>
<span class="sd"> internally.</span>
<span class="sd"> mangle_dupe_cols : bool, default True</span>
<span class="sd"> Duplicate columns will be specified as &#39;X&#39;, &#39;X.1&#39;, ...&#39;X.N&#39;, rather than</span>
<span class="sd"> &#39;X&#39;...&#39;X&#39;. Passing in False will cause data to be overwritten if there</span>
<span class="sd"> are duplicate names in the columns.</span>
<span class="sd"> **kwds : optional</span>
<span class="sd"> Optional keyword arguments can be passed to ``TextFileReader``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or dict of DataFrames</span>
<span class="sd"> DataFrame from the passed in Excel file. See notes in sheet_name</span>
<span class="sd"> argument for more information on when a dict of DataFrames is returned.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_excel : Write DataFrame to an Excel file.</span>
<span class="sd"> DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.</span>
<span class="sd"> read_csv : Read a comma-separated values (csv) file into DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> The file can be read using the file name as string or an open file object:</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1</span>
<span class="sd"> 1 string2 2</span>
<span class="sd"> 2 #Comment 3</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(open(&#39;tmp.xlsx&#39;, &#39;rb&#39;),</span>
<span class="sd"> ... sheet_name=&#39;Sheet3&#39;) # doctest: +SKIP</span>
<span class="sd"> Unnamed: 0 Name Value</span>
<span class="sd"> 0 0 string1 1</span>
<span class="sd"> 1 1 string2 2</span>
<span class="sd"> 2 2 #Comment 3</span>
<span class="sd"> Index and header can be specified via the `index_col` and `header` arguments</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=None, header=None) # doctest: +SKIP</span>
<span class="sd"> 0 1 2</span>
<span class="sd"> 0 NaN Name Value</span>
<span class="sd"> 1 0.0 string1 1</span>
<span class="sd"> 2 1.0 string2 2</span>
<span class="sd"> 3 2.0 #Comment 3</span>
<span class="sd"> Column types are inferred but can be explicitly specified</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0,</span>
<span class="sd"> ... dtype={&#39;Name&#39;: str, &#39;Value&#39;: float}) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1.0</span>
<span class="sd"> 1 string2 2.0</span>
<span class="sd"> 2 #Comment 3.0</span>
<span class="sd"> True, False, and NA values, and thousands separators have defaults,</span>
<span class="sd"> but can be explicitly specified, too. Supply the values you would like</span>
<span class="sd"> as strings or lists of strings!</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0,</span>
<span class="sd"> ... na_values=[&#39;string1&#39;, &#39;string2&#39;]) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 None 1</span>
<span class="sd"> 1 None 2</span>
<span class="sd"> 2 #Comment 3</span>
<span class="sd"> Comment lines in the excel input file can be skipped using the `comment` kwarg</span>
<span class="sd"> &gt;&gt;&gt; ps.read_excel(&#39;tmp.xlsx&#39;, index_col=0, comment=&#39;#&#39;) # doctest: +SKIP</span>
<span class="sd"> Name Value</span>
<span class="sd"> 0 string1 1.0</span>
<span class="sd"> 1 string2 2.0</span>
<span class="sd"> 2 None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">pd_read_excel</span><span class="p">(</span>
<span class="n">io_or_bin</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">],</span> <span class="n">sq</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span>
<span class="n">io</span><span class="o">=</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="p">(</span><span class="nb">bytes</span><span class="p">,</span> <span class="nb">bytearray</span><span class="p">))</span> <span class="k">else</span> <span class="n">io_or_bin</span><span class="p">,</span>
<span class="n">sheet_name</span><span class="o">=</span><span class="n">sn</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span>
<span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span>
<span class="n">usecols</span><span class="o">=</span><span class="n">usecols</span><span class="p">,</span>
<span class="n">squeeze</span><span class="o">=</span><span class="n">sq</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">engine</span><span class="o">=</span><span class="n">engine</span><span class="p">,</span>
<span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span>
<span class="n">true_values</span><span class="o">=</span><span class="n">true_values</span><span class="p">,</span>
<span class="n">false_values</span><span class="o">=</span><span class="n">false_values</span><span class="p">,</span>
<span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span>
<span class="n">nrows</span><span class="o">=</span><span class="n">nrows</span><span class="p">,</span>
<span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span>
<span class="n">verbose</span><span class="o">=</span><span class="n">verbose</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="n">date_parser</span><span class="o">=</span><span class="n">date_parser</span><span class="p">,</span>
<span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span>
<span class="n">comment</span><span class="o">=</span><span class="n">comment</span><span class="p">,</span>
<span class="n">skipfooter</span><span class="o">=</span><span class="n">skipfooter</span><span class="p">,</span>
<span class="n">convert_float</span><span class="o">=</span><span class="n">convert_float</span><span class="p">,</span>
<span class="n">mangle_dupe_cols</span><span class="o">=</span><span class="n">mangle_dupe_cols</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwds</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">io</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="c1"># &#39;binaryFile&#39; format is available since Spark 3.0.0.</span>
<span class="n">binaries</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">binaries</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">binaries</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">io_or_bin</span> <span class="o">=</span> <span class="n">io</span>
<span class="n">single_file</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">pdf_or_psers</span> <span class="o">=</span> <span class="n">pd_read_excel</span><span class="p">(</span><span class="n">io_or_bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sheet_name</span><span class="p">,</span> <span class="n">sq</span><span class="o">=</span><span class="n">squeeze</span><span class="p">)</span>
<span class="k">if</span> <span class="n">single_file</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">))</span>
<span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">read_excel_on_spark</span><span class="p">(</span>
<span class="n">pdf_or_pser</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]],</span> <span class="kc">None</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf_or_pser</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">))</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span>
<span class="n">as_nullable_spark_type</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">output_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">concat</span><span class="p">(</span>
<span class="p">[</span><span class="n">pd_read_excel</span><span class="p">(</span><span class="nb">bin</span><span class="p">,</span> <span class="n">sn</span><span class="o">=</span><span class="n">sn</span><span class="p">,</span> <span class="n">sq</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> <span class="k">for</span> <span class="nb">bin</span> <span class="ow">in</span> <span class="n">pdf</span><span class="p">[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]]]</span>
<span class="p">)</span>
<span class="n">reset_index</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
<span class="k">for</span> <span class="n">name</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">reset_index</span><span class="o">.</span><span class="n">iteritems</span><span class="p">():</span>
<span class="n">dt</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">dtype</span>
<span class="k">if</span> <span class="n">is_datetime64_dtype</span><span class="p">(</span><span class="n">dt</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_datetime64tz_dtype</span><span class="p">(</span><span class="n">dt</span><span class="p">):</span>
<span class="k">continue</span>
<span class="n">reset_index</span><span class="p">[</span><span class="n">name</span><span class="p">]</span> <span class="o">=</span> <span class="n">col</span><span class="o">.</span><span class="n">replace</span><span class="p">({</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">:</span> <span class="kc">None</span><span class="p">})</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">reset_index</span>
<span class="c1"># Just positionally map the column names to given schema&#39;s.</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="nb">dict</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">,</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span><span class="p">)))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">default_session</span><span class="p">()</span>
<span class="o">.</span><span class="n">read</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;binaryFile&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">io</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">&quot;content&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">mapInPandas</span><span class="p">(</span><span class="k">lambda</span> <span class="n">iterator</span><span class="p">:</span> <span class="nb">map</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">iterator</span><span class="p">),</span> <span class="n">schema</span><span class="o">=</span><span class="n">return_schema</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span>
<span class="k">if</span> <span class="n">squeeze</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="n">sn</span><span class="p">:</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_pser</span><span class="p">,</span> <span class="n">sn</span><span class="p">)</span> <span class="k">for</span> <span class="n">sn</span><span class="p">,</span> <span class="n">pdf_or_pser</span> <span class="ow">in</span> <span class="n">pdf_or_psers</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">}</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">read_excel_on_spark</span><span class="p">(</span><span class="n">pdf_or_psers</span><span class="p">,</span> <span class="n">sheet_name</span><span class="p">)</span></div>
<div class="viewcode-block" id="read_html"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_html.html#pyspark.pandas.read_html">[docs]</a><span class="k">def</span> <span class="nf">read_html</span><span class="p">(</span>
<span class="n">io</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span>
<span class="n">match</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;.+&quot;</span><span class="p">,</span>
<span class="n">flavor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skiprows</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="nb">slice</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">attrs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">thousands</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">decimal</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;.&quot;</span><span class="p">,</span>
<span class="n">converters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">na_values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">displayed_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;Read HTML tables into a ``list`` of ``DataFrame`` objects.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> io : str or file-like</span>
<span class="sd"> A URL, a file-like object, or a raw string containing HTML. Note that</span>
<span class="sd"> lxml only accepts the http, ftp and file url protocols. If you have a</span>
<span class="sd"> URL that starts with ``&#39;https&#39;`` you might try removing the ``&#39;s&#39;``.</span>
<span class="sd"> match : str or compiled regular expression, optional</span>
<span class="sd"> The set of tables containing text matching this regex or string will be</span>
<span class="sd"> returned. Unless the HTML is extremely simple you will probably need to</span>
<span class="sd"> pass a non-empty string here. Defaults to &#39;.+&#39; (match any non-empty</span>
<span class="sd"> string). The default value will return all tables contained on a page.</span>
<span class="sd"> This value is converted to a regular expression so that there is</span>
<span class="sd"> consistent behavior between Beautiful Soup and lxml.</span>
<span class="sd"> flavor : str or None, container of strings</span>
<span class="sd"> The parsing engine to use. &#39;bs4&#39; and &#39;html5lib&#39; are synonymous with</span>
<span class="sd"> each other, they are both there for backwards compatibility. The</span>
<span class="sd"> default of ``None`` tries to use ``lxml`` to parse and if that fails it</span>
<span class="sd"> falls back on ``bs4`` + ``html5lib``.</span>
<span class="sd"> header : int or list-like or None, optional</span>
<span class="sd"> The row (or list of rows for a :class:`~ps.MultiIndex`) to use to</span>
<span class="sd"> make the columns headers.</span>
<span class="sd"> index_col : int or list-like or None, optional</span>
<span class="sd"> The column (or list of columns) to use to create the index.</span>
<span class="sd"> skiprows : int or list-like or slice or None, optional</span>
<span class="sd"> 0-based. Number of rows to skip after parsing the column integer. If a</span>
<span class="sd"> sequence of integers or a slice is given, will skip the rows indexed by</span>
<span class="sd"> that sequence. Note that a single element sequence means &#39;skip the nth</span>
<span class="sd"> row&#39; whereas an integer means &#39;skip n rows&#39;.</span>
<span class="sd"> attrs : dict or None, optional</span>
<span class="sd"> This is a dictionary of attributes that you can pass to identify</span>
<span class="sd"> the table in the HTML. These are not checked for validity before being</span>
<span class="sd"> passed to lxml or Beautiful Soup. However, these attributes must be</span>
<span class="sd"> valid HTML table attributes to work correctly. For example, ::</span>
<span class="sd"> attrs = {&#39;id&#39;: &#39;table&#39;}</span>
<span class="sd"> is a valid attribute dictionary because the &#39;id&#39; HTML tag attribute is</span>
<span class="sd"> a valid HTML attribute for *any* HTML tag as per `this document</span>
<span class="sd"> &lt;http://www.w3.org/TR/html-markup/global-attributes.html&gt;`__. ::</span>
<span class="sd"> attrs = {&#39;asdf&#39;: &#39;table&#39;}</span>
<span class="sd"> is *not* a valid attribute dictionary because &#39;asdf&#39; is not a valid</span>
<span class="sd"> HTML attribute even if it is a valid XML attribute. Valid HTML 4.01</span>
<span class="sd"> table attributes can be found `here</span>
<span class="sd"> &lt;http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2&gt;`__. A</span>
<span class="sd"> working draft of the HTML 5 spec can be found `here</span>
<span class="sd"> &lt;http://www.w3.org/TR/html-markup/table.html&gt;`__. It contains the</span>
<span class="sd"> latest information on table attributes for the modern web.</span>
<span class="sd"> parse_dates : bool, optional</span>
<span class="sd"> See :func:`~ps.read_csv` for more details.</span>
<span class="sd"> thousands : str, optional</span>
<span class="sd"> Separator to use to parse thousands. Defaults to ``&#39;,&#39;``.</span>
<span class="sd"> encoding : str or None, optional</span>
<span class="sd"> The encoding used to decode the web page. Defaults to ``None``. ``None``</span>
<span class="sd"> preserves the previous encoding behavior, which depends on the</span>
<span class="sd"> underlying parser library (e.g., the parser library will try to use</span>
<span class="sd"> the encoding provided by the document).</span>
<span class="sd"> decimal : str, default &#39;.&#39;</span>
<span class="sd"> Character to recognize as decimal point (example: use &#39;,&#39; for European</span>
<span class="sd"> data).</span>
<span class="sd"> converters : dict, default None</span>
<span class="sd"> Dict of functions for converting values in certain columns. Keys can</span>
<span class="sd"> either be integers or column labels, values are functions that take one</span>
<span class="sd"> input argument, the cell (not column) content, and return the</span>
<span class="sd"> transformed content.</span>
<span class="sd"> na_values : iterable, default None</span>
<span class="sd"> Custom NA values</span>
<span class="sd"> keep_default_na : bool, default True</span>
<span class="sd"> If na_values are specified and keep_default_na is False the default NaN</span>
<span class="sd"> values are overridden, otherwise they&#39;re appended to the defaults.</span>
<span class="sd"> displayed_only : bool, default True</span>
<span class="sd"> Whether elements with &quot;display: none&quot; should be parsed</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dfs : list of DataFrames</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_csv</span>
<span class="sd"> DataFrame.to_html</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">pdfs</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_html</span><span class="p">(</span>
<span class="n">io</span><span class="o">=</span><span class="n">io</span><span class="p">,</span>
<span class="n">match</span><span class="o">=</span><span class="n">match</span><span class="p">,</span>
<span class="n">flavor</span><span class="o">=</span><span class="n">flavor</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span>
<span class="n">skiprows</span><span class="o">=</span><span class="n">skiprows</span><span class="p">,</span>
<span class="n">attrs</span><span class="o">=</span><span class="n">attrs</span><span class="p">,</span>
<span class="n">parse_dates</span><span class="o">=</span><span class="n">parse_dates</span><span class="p">,</span>
<span class="n">thousands</span><span class="o">=</span><span class="n">thousands</span><span class="p">,</span>
<span class="n">encoding</span><span class="o">=</span><span class="n">encoding</span><span class="p">,</span>
<span class="n">decimal</span><span class="o">=</span><span class="n">decimal</span><span class="p">,</span>
<span class="n">converters</span><span class="o">=</span><span class="n">converters</span><span class="p">,</span>
<span class="n">na_values</span><span class="o">=</span><span class="n">na_values</span><span class="p">,</span>
<span class="n">keep_default_na</span><span class="o">=</span><span class="n">keep_default_na</span><span class="p">,</span>
<span class="n">displayed_only</span><span class="o">=</span><span class="n">displayed_only</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">],</span> <span class="p">[</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="k">for</span> <span class="n">pdf</span> <span class="ow">in</span> <span class="n">pdfs</span><span class="p">])</span></div>
<span class="c1"># TODO: add `coerce_float` and &#39;parse_dates&#39; parameters</span>
<div class="viewcode-block" id="read_sql_table"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_table.html#pyspark.pandas.read_sql_table">[docs]</a><span class="k">def</span> <span class="nf">read_sql_table</span><span class="p">(</span>
<span class="n">table_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read SQL database table into a DataFrame.</span>
<span class="sd"> Given a table name and a JDBC URI, returns a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table_name : str</span>
<span class="sd"> Name of SQL table in database.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as a str.</span>
<span class="sd"> .. note:: The URI must be JDBC URI instead of Python&#39;s database URI.</span>
<span class="sd"> schema : str, default None</span>
<span class="sd"> Name of SQL schema in database to query (if database flavor</span>
<span class="sd"> supports this). Uses default schema if None (default).</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> columns : list, default None</span>
<span class="sd"> List of column names to select from SQL table.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> A SQL table is returned as two-dimensional data structure with labeled</span>
<span class="sd"> axes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span>
<span class="sd"> read_sql : Read SQL query or database table into a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql_table(&#39;table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;dbtable&quot;</span><span class="p">,</span> <span class="n">table_name</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;url&quot;</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">reader</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;jdbc&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">columns</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">columns</span><span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<span class="c1"># TODO: add `coerce_float`, `params`, and &#39;parse_dates&#39; parameters</span>
<div class="viewcode-block" id="read_sql_query"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql_query.html#pyspark.pandas.read_sql_query">[docs]</a><span class="k">def</span> <span class="nf">read_sql_query</span><span class="p">(</span>
<span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Read SQL query into a DataFrame.</span>
<span class="sd"> Returns a DataFrame corresponding to the result set of the query</span>
<span class="sd"> string. Optionally provide an `index_col` parameter to use one of the</span>
<span class="sd"> columns as the index, otherwise default index will be used.</span>
<span class="sd"> .. note:: Some databases might hit the Spark issue SPARK-27596.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sql : string SQL query</span>
<span class="sd"> SQL query to be executed.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as a str.</span>
<span class="sd"> .. note:: The URI must be JDBC URI instead of Python&#39;s database URI.</span>
<span class="sd"> index_col : string or list of strings, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span>
<span class="sd"> read_sql</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql_query(&#39;SELECT * FROM table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">reader</span> <span class="o">=</span> <span class="n">default_session</span><span class="p">()</span><span class="o">.</span><span class="n">read</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;query&quot;</span><span class="p">,</span> <span class="n">sql</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">option</span><span class="p">(</span><span class="s2">&quot;url&quot;</span><span class="p">,</span> <span class="n">con</span><span class="p">)</span>
<span class="n">reader</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">reader</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;jdbc&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">load</span><span class="p">()</span>
<span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span> <span class="o">=</span> <span class="n">_get_index_map</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_col</span><span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span>
<span class="p">)</span>
<span class="p">)</span></div>
<span class="c1"># TODO: add `coerce_float`, `params`, and `parse_dates` parameters</span>
<div class="viewcode-block" id="read_sql"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_sql.html#pyspark.pandas.read_sql">[docs]</a><span class="k">def</span> <span class="nf">read_sql</span><span class="p">(</span>
<span class="n">sql</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">con</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Read SQL query or database table into a DataFrame.</span>
<span class="sd"> This function is a convenience wrapper around ``read_sql_table`` and</span>
<span class="sd"> ``read_sql_query`` (for backward compatibility). It will delegate</span>
<span class="sd"> to the specific function depending on the provided input. A SQL query</span>
<span class="sd"> will be routed to ``read_sql_query``, while a database table name will</span>
<span class="sd"> be routed to ``read_sql_table``. Note that the delegated function might</span>
<span class="sd"> have more specific notes about its functionality not listed here.</span>
<span class="sd"> .. note:: Some databases might hit the Spark issue SPARK-27596.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sql : string</span>
<span class="sd"> SQL query to be executed or a table name.</span>
<span class="sd"> con : str</span>
<span class="sd"> A JDBC URI could be provided as str.</span>
<span class="sd"> .. note:: The URI must be JDBC URI instead of Python&#39;s database URI.</span>
<span class="sd"> index_col : string or list of strings, optional, default: None</span>
<span class="sd"> Column(s) to set as index(MultiIndex).</span>
<span class="sd"> columns : list, default: None</span>
<span class="sd"> List of column names to select from SQL table (only used when reading</span>
<span class="sd"> a table).</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s JDBC data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_sql_table : Read SQL database table into a DataFrame.</span>
<span class="sd"> read_sql_query : Read SQL query into a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql(&#39;table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; ps.read_sql(&#39;SELECT * FROM table_name&#39;, &#39;jdbc:postgresql:db_name&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">striped</span> <span class="o">=</span> <span class="n">sql</span><span class="o">.</span><span class="n">strip</span><span class="p">()</span>
<span class="k">if</span> <span class="s2">&quot; &quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">striped</span><span class="p">:</span> <span class="c1"># TODO: identify the table name or not more precisely.</span>
<span class="k">return</span> <span class="n">read_sql_table</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">read_sql_query</span><span class="p">(</span><span class="n">sql</span><span class="p">,</span> <span class="n">con</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_datetime"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_datetime.html#pyspark.pandas.to_datetime">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_datetime</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;raise&quot;</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">origin</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;unix&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to datetime.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : integer, float, string, datetime, list, tuple, 1-d array, Series</span>
<span class="sd"> or DataFrame/dict-like</span>
<span class="sd"> errors : {&#39;ignore&#39;, &#39;raise&#39;, &#39;coerce&#39;}, default &#39;raise&#39;</span>
<span class="sd"> - If &#39;raise&#39;, then invalid parsing will raise an exception</span>
<span class="sd"> - If &#39;coerce&#39;, then invalid parsing will be set as NaT</span>
<span class="sd"> - If &#39;ignore&#39;, then invalid parsing will return the input</span>
<span class="sd"> format : string, default None</span>
<span class="sd"> strftime to parse time, e.g. &quot;%d/%m/%Y&quot;; note that &quot;%f&quot; will parse</span>
<span class="sd"> all the way up to nanoseconds.</span>
<span class="sd"> unit : string, default None</span>
<span class="sd"> The unit of the arg (D, s, ms, us, ns), where the arg is an</span>
<span class="sd"> integer or float number. This will be based off the origin.</span>
<span class="sd"> Example, with unit=&#39;ms&#39; and origin=&#39;unix&#39; (the default), this</span>
<span class="sd"> would calculate the number of milliseconds to the unix epoch start.</span>
<span class="sd"> infer_datetime_format : boolean, default False</span>
<span class="sd"> If True and no `format` is given, attempt to infer the format of the</span>
<span class="sd"> datetime strings, and if it can be inferred, switch to a faster</span>
<span class="sd"> method of parsing them. In some cases this can increase the parsing</span>
<span class="sd"> speed by ~5-10x.</span>
<span class="sd"> origin : scalar, default &#39;unix&#39;</span>
<span class="sd"> Define the reference date. The numeric values would be parsed as number</span>
<span class="sd"> of units (defined by `unit`) since this reference date.</span>
<span class="sd"> - If &#39;unix&#39; (or POSIX) time; origin is set to 1970-01-01.</span>
<span class="sd"> - If &#39;julian&#39;, unit must be &#39;D&#39;, and origin is set to beginning of</span>
<span class="sd"> Julian Calendar. Julian day number 0 is assigned to the day starting</span>
<span class="sd"> at noon on January 1, 4713 BC.</span>
<span class="sd"> - If Timestamp convertible, origin is set to Timestamp identified by</span>
<span class="sd"> origin.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : datetime if parsing succeeded.</span>
<span class="sd"> Return type depends on input:</span>
<span class="sd"> - list-like: DatetimeIndex</span>
<span class="sd"> - Series: Series of datetime64 dtype</span>
<span class="sd"> - scalar: Timestamp</span>
<span class="sd"> In cases where it is not possible to return designated types (e.g. when</span>
<span class="sd"> any element of input is before Timestamp.min or after Timestamp.max)</span>
<span class="sd"> return will have datetime.datetime type (or corresponding</span>
<span class="sd"> array/Series).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Assembling a datetime from multiple columns of a DataFrame. The keys can be</span>
<span class="sd"> common abbreviations like [&#39;year&#39;, &#39;month&#39;, &#39;day&#39;, &#39;minute&#39;, &#39;second&#39;,</span>
<span class="sd"> &#39;ms&#39;, &#39;us&#39;, &#39;ns&#39;] or plurals of the same.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;year&#39;: [2015, 2016],</span>
<span class="sd"> ... &#39;month&#39;: [2, 3],</span>
<span class="sd"> ... &#39;day&#39;: [4, 5]})</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(df)</span>
<span class="sd"> 0 2015-02-04</span>
<span class="sd"> 1 2016-03-05</span>
<span class="sd"> dtype: datetime64[ns]</span>
<span class="sd"> If a date does not meet the `timestamp limitations</span>
<span class="sd"> &lt;http://pandas.pydata.org/pandas-docs/stable/timeseries.html</span>
<span class="sd"> #timeseries-timestamp-limits&gt;`_, passing errors=&#39;ignore&#39;</span>
<span class="sd"> will return the original input instead of raising any exception.</span>
<span class="sd"> Passing errors=&#39;coerce&#39; will force an out-of-bounds date to NaT,</span>
<span class="sd"> in addition to forcing non-dates (or non-parseable dates) to NaT.</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(&#39;13000101&#39;, format=&#39;%Y%m%d&#39;, errors=&#39;ignore&#39;)</span>
<span class="sd"> datetime.datetime(1300, 1, 1, 0, 0)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(&#39;13000101&#39;, format=&#39;%Y%m%d&#39;, errors=&#39;coerce&#39;)</span>
<span class="sd"> NaT</span>
<span class="sd"> Passing infer_datetime_format=True can often speed up parsing</span>
<span class="sd"> if the input is not exactly in ISO8601 format, but is in a regular format.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([&#39;3/11/2000&#39;, &#39;3/12/2000&#39;, &#39;3/13/2000&#39;] * 1000)</span>
<span class="sd"> &gt;&gt;&gt; s.head()</span>
<span class="sd"> 0 3/11/2000</span>
<span class="sd"> 1 3/12/2000</span>
<span class="sd"> 2 3/13/2000</span>
<span class="sd"> 3 3/11/2000</span>
<span class="sd"> 4 3/12/2000</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; import timeit</span>
<span class="sd"> &gt;&gt;&gt; timeit.timeit(</span>
<span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=True)),</span>
<span class="sd"> ... number = 1) # doctest: +SKIP</span>
<span class="sd"> 0.35832712500000063</span>
<span class="sd"> &gt;&gt;&gt; timeit.timeit(</span>
<span class="sd"> ... lambda: repr(ps.to_datetime(s, infer_datetime_format=False)),</span>
<span class="sd"> ... number = 1) # doctest: +SKIP</span>
<span class="sd"> 0.8895321660000004</span>
<span class="sd"> Using a unix epoch time</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(1490195805, unit=&#39;s&#39;)</span>
<span class="sd"> Timestamp(&#39;2017-03-22 15:16:45&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime(1490195805433502912, unit=&#39;ns&#39;)</span>
<span class="sd"> Timestamp(&#39;2017-03-22 15:16:45.433502912&#39;)</span>
<span class="sd"> Using a non-unix epoch origin</span>
<span class="sd"> &gt;&gt;&gt; ps.to_datetime([1, 2, 3], unit=&#39;D&#39;, origin=pd.Timestamp(&#39;1960-01-01&#39;))</span>
<span class="sd"> DatetimeIndex([&#39;1960-01-02&#39;, &#39;1960-01-03&#39;, &#39;1960-01-04&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># mappings for assembling units</span>
<span class="c1"># From pandas: pandas.core.tools.datetimes</span>
<span class="n">_unit_map</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;year&quot;</span><span class="p">:</span> <span class="s2">&quot;year&quot;</span><span class="p">,</span>
<span class="s2">&quot;years&quot;</span><span class="p">:</span> <span class="s2">&quot;year&quot;</span><span class="p">,</span>
<span class="s2">&quot;month&quot;</span><span class="p">:</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span>
<span class="s2">&quot;months&quot;</span><span class="p">:</span> <span class="s2">&quot;month&quot;</span><span class="p">,</span>
<span class="s2">&quot;day&quot;</span><span class="p">:</span> <span class="s2">&quot;day&quot;</span><span class="p">,</span>
<span class="s2">&quot;days&quot;</span><span class="p">:</span> <span class="s2">&quot;day&quot;</span><span class="p">,</span>
<span class="s2">&quot;hour&quot;</span><span class="p">:</span> <span class="s2">&quot;h&quot;</span><span class="p">,</span>
<span class="s2">&quot;hours&quot;</span><span class="p">:</span> <span class="s2">&quot;h&quot;</span><span class="p">,</span>
<span class="s2">&quot;minute&quot;</span><span class="p">:</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span>
<span class="s2">&quot;minutes&quot;</span><span class="p">:</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span>
<span class="s2">&quot;second&quot;</span><span class="p">:</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span>
<span class="s2">&quot;seconds&quot;</span><span class="p">:</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span>
<span class="s2">&quot;ms&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;millisecond&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;milliseconds&quot;</span><span class="p">:</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span>
<span class="s2">&quot;us&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="s2">&quot;microsecond&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="s2">&quot;microseconds&quot;</span><span class="p">:</span> <span class="s2">&quot;us&quot;</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">def</span> <span class="nf">pandas_to_datetime</span><span class="p">(</span>
<span class="n">pser_or_pdf</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">[</span><span class="n">np</span><span class="o">.</span><span class="n">datetime64</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">pser_or_pdf</span><span class="p">[</span><span class="n">cols</span><span class="p">]</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span>
<span class="n">pser_or_pdf</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span>
<span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="n">unit</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">_unit_map</span><span class="p">[</span><span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()]</span> <span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="n">arg</span><span class="o">.</span><span class="n">keys</span><span class="p">()</span> <span class="k">if</span> <span class="n">k</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="ow">in</span> <span class="n">_unit_map</span><span class="p">}</span>
<span class="n">unit_rev</span> <span class="o">=</span> <span class="p">{</span><span class="n">v</span><span class="p">:</span> <span class="n">k</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">unit</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="n">list_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;year&quot;</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;month&quot;</span><span class="p">],</span> <span class="n">unit_rev</span><span class="p">[</span><span class="s2">&quot;day&quot;</span><span class="p">]]</span>
<span class="k">for</span> <span class="n">u</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;h&quot;</span><span class="p">,</span> <span class="s2">&quot;m&quot;</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">,</span> <span class="s2">&quot;ms&quot;</span><span class="p">,</span> <span class="s2">&quot;us&quot;</span><span class="p">]:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">unit_rev</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">u</span><span class="p">)</span>
<span class="k">if</span> <span class="n">value</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">arg</span><span class="p">:</span>
<span class="n">list_cols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">value</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">arg</span><span class="p">[</span><span class="n">list_cols</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span><span class="o">.</span><span class="n">pandas_on_spark</span><span class="o">.</span><span class="n">transform_batch</span><span class="p">(</span><span class="n">pandas_to_datetime</span><span class="p">,</span> <span class="n">list_cols</span><span class="p">)</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_datetime</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="nb">format</span><span class="o">=</span><span class="nb">format</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">infer_datetime_format</span><span class="o">=</span><span class="n">infer_datetime_format</span><span class="p">,</span>
<span class="n">origin</span><span class="o">=</span><span class="n">origin</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="date_range"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.date_range.html#pyspark.pandas.date_range">[docs]</a><span class="k">def</span> <span class="nf">date_range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">tz</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">tzinfo</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">normalize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">closed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DatetimeIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a fixed frequency DatetimeIndex.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : str or datetime-like, optional</span>
<span class="sd"> Left bound for generating dates.</span>
<span class="sd"> end : str or datetime-like, optional</span>
<span class="sd"> Right bound for generating dates.</span>
<span class="sd"> periods : int, optional</span>
<span class="sd"> Number of periods to generate.</span>
<span class="sd"> freq : str or DateOffset, default &#39;D&#39;</span>
<span class="sd"> Frequency strings can have multiples, e.g. &#39;5H&#39;.</span>
<span class="sd"> tz : str or tzinfo, optional</span>
<span class="sd"> Time zone name for returning localized DatetimeIndex, for example</span>
<span class="sd"> &#39;Asia/Hong_Kong&#39;. By default, the resulting DatetimeIndex is</span>
<span class="sd"> timezone-naive.</span>
<span class="sd"> normalize : bool, default False</span>
<span class="sd"> Normalize start/end dates to midnight before generating date range.</span>
<span class="sd"> name : str, default None</span>
<span class="sd"> Name of the resulting DatetimeIndex.</span>
<span class="sd"> closed : {None, &#39;left&#39;, &#39;right&#39;}, optional</span>
<span class="sd"> Make the interval closed with respect to the given frequency to</span>
<span class="sd"> the &#39;left&#39;, &#39;right&#39;, or both sides (None, the default).</span>
<span class="sd"> **kwargs</span>
<span class="sd"> For compatibility. Has no effect on the result.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> rng : DatetimeIndex</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DatetimeIndex : An immutable container for datetimes.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span>
<span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span>
<span class="sd"> ``DatetimeIndex`` will have ``periods`` linearly spaced elements between</span>
<span class="sd"> ``start`` and ``end`` (closed on both sides).</span>
<span class="sd"> To learn more about the frequency strings, please see `this link</span>
<span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> **Specifying the values**</span>
<span class="sd"> The next four examples generate the same `DatetimeIndex`, but vary</span>
<span class="sd"> the combination of `start`, `end` and `periods`.</span>
<span class="sd"> Specify `start` and `end`, with the default daily frequency.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, end=&#39;1/08/2018&#39;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-01&#39;, &#39;2018-01-02&#39;, &#39;2018-01-03&#39;, &#39;2018-01-04&#39;,</span>
<span class="sd"> &#39;2018-01-05&#39;, &#39;2018-01-06&#39;, &#39;2018-01-07&#39;, &#39;2018-01-08&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `start` and `periods`, the number of periods (days).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=8) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-01&#39;, &#39;2018-01-02&#39;, &#39;2018-01-03&#39;, &#39;2018-01-04&#39;,</span>
<span class="sd"> &#39;2018-01-05&#39;, &#39;2018-01-06&#39;, &#39;2018-01-07&#39;, &#39;2018-01-08&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `end` and `periods`, the number of periods (days).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(end=&#39;1/1/2018&#39;, periods=8) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-12-25&#39;, &#39;2017-12-26&#39;, &#39;2017-12-27&#39;, &#39;2017-12-28&#39;,</span>
<span class="sd"> &#39;2017-12-29&#39;, &#39;2017-12-30&#39;, &#39;2017-12-31&#39;, &#39;2018-01-01&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify `start`, `end`, and `periods`; the frequency is generated</span>
<span class="sd"> automatically (linearly spaced).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2018-04-24&#39;, end=&#39;2018-04-27&#39;, periods=3</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-04-24 00:00:00&#39;, &#39;2018-04-25 12:00:00&#39;,</span>
<span class="sd"> &#39;2018-04-27 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> **Other Parameters**</span>
<span class="sd"> Changed the `freq` (frequency) to ``&#39;M&#39;`` (month end frequency).</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=5, freq=&#39;M&#39;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-02-28&#39;, &#39;2018-03-31&#39;, &#39;2018-04-30&#39;,</span>
<span class="sd"> &#39;2018-05-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Multiples are allowed</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(start=&#39;1/1/2018&#39;, periods=5, freq=&#39;3M&#39;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-04-30&#39;, &#39;2018-07-31&#39;, &#39;2018-10-31&#39;,</span>
<span class="sd"> &#39;2019-01-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> `freq` can also be specified as an Offset object.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;1/1/2018&#39;, periods=5, freq=pd.offsets.MonthEnd(3)</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2018-01-31&#39;, &#39;2018-04-30&#39;, &#39;2018-07-31&#39;, &#39;2018-10-31&#39;,</span>
<span class="sd"> &#39;2019-01-31&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> `closed` controls whether to include `start` and `end` that are on the</span>
<span class="sd"> boundary. The default includes boundary points on either end.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, closed=None</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-01&#39;, &#39;2017-01-02&#39;, &#39;2017-01-03&#39;, &#39;2017-01-04&#39;],</span>
<span class="sd"> dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Use ``closed=&#39;left&#39;`` to exclude `end` if it falls on the boundary.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, closed=&#39;left&#39;</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-01&#39;, &#39;2017-01-02&#39;, &#39;2017-01-03&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> Use ``closed=&#39;right&#39;`` to exclude `start` if it falls on the boundary.</span>
<span class="sd"> &gt;&gt;&gt; ps.date_range(</span>
<span class="sd"> ... start=&#39;2017-01-01&#39;, end=&#39;2017-01-04&#39;, closed=&#39;right&#39;</span>
<span class="sd"> ... ) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> DatetimeIndex([&#39;2017-01-02&#39;, &#39;2017-01-03&#39;, &#39;2017-01-04&#39;], dtype=&#39;datetime64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;N&quot;</span><span class="p">,</span> <span class="s2">&quot;ns&quot;</span><span class="p">],</span> <span class="s2">&quot;nanoseconds is not supported&quot;</span>
<span class="k">assert</span> <span class="n">tz</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">,</span> <span class="s2">&quot;Localized DatetimeIndex is not supported&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">DatetimeIndex</span><span class="p">,</span>
<span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span>
<span class="n">pd</span><span class="o">.</span><span class="n">date_range</span><span class="p">(</span>
<span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span>
<span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span>
<span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span>
<span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span>
<span class="n">tz</span><span class="o">=</span><span class="n">tz</span><span class="p">,</span>
<span class="n">normalize</span><span class="o">=</span><span class="n">normalize</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span>
<span class="n">closed</span><span class="o">=</span><span class="n">closed</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="to_timedelta"><!-- Highlighted source listing for to_timedelta. As shown at the end of this block, a Series argument is routed through arg.transform with the nested pandas_to_timedelta helper, and any other argument is passed straight to pd.to_timedelta with the same unit and errors values. The [docs] anchor links back to the API reference page for this function. --><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_timedelta.html#pyspark.pandas.to_timedelta">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="p">,</span>
<span class="n">unit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;raise&quot;</span><span class="p">,</span>
<span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to timedelta.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : str, timedelta, list-like or Series</span>
<span class="sd"> The data to be converted to timedelta.</span>
<span class="sd"> unit : str, optional</span>
<span class="sd"> Denotes the unit of the arg for numeric `arg`. Defaults to ``&quot;ns&quot;``.</span>
<span class="sd"> Possible values:</span>
<span class="sd"> * &#39;W&#39;</span>
<span class="sd"> * &#39;D&#39; / &#39;days&#39; / &#39;day&#39;</span>
<span class="sd"> * &#39;hours&#39; / &#39;hour&#39; / &#39;hr&#39; / &#39;h&#39;</span>
<span class="sd"> * &#39;m&#39; / &#39;minute&#39; / &#39;min&#39; / &#39;minutes&#39; / &#39;T&#39;</span>
<span class="sd"> * &#39;S&#39; / &#39;seconds&#39; / &#39;sec&#39; / &#39;second&#39;</span>
<span class="sd"> * &#39;ms&#39; / &#39;milliseconds&#39; / &#39;millisecond&#39; / &#39;milli&#39; / &#39;millis&#39; / &#39;L&#39;</span>
<span class="sd"> * &#39;us&#39; / &#39;microseconds&#39; / &#39;microsecond&#39; / &#39;micro&#39; / &#39;micros&#39; / &#39;U&#39;</span>
<span class="sd"> * &#39;ns&#39; / &#39;nanoseconds&#39; / &#39;nano&#39; / &#39;nanos&#39; / &#39;nanosecond&#39; / &#39;N&#39;</span>
<span class="sd"> Must not be specified when `arg` contains strings and ``errors=&quot;raise&quot;``.</span>
<span class="sd"> errors : {&#39;ignore&#39;, &#39;raise&#39;, &#39;coerce&#39;}, default &#39;raise&#39;</span>
<span class="sd"> - If &#39;raise&#39;, then invalid parsing will raise an exception.</span>
<span class="sd"> - If &#39;coerce&#39;, then invalid parsing will be set as NaT.</span>
<span class="sd"> - If &#39;ignore&#39;, then invalid parsing will return the input.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : timedelta64, TimedeltaIndex or Series of timedelta64 if parsing succeeded.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> If the precision is higher than nanoseconds, the precision of the duration is</span>
<span class="sd"> truncated to nanoseconds for string inputs.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Parsing a single string to a Timedelta:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(&#39;1 days 06:05:01.00003&#39;)</span>
<span class="sd"> Timedelta(&#39;1 days 06:05:01.000030&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(&#39;15.5us&#39;) # doctest: +SKIP</span>
<span class="sd"> Timedelta(&#39;0 days 00:00:00.000015500&#39;)</span>
<span class="sd"> Parsing a list or array of strings:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta([&#39;1 days 06:05:01.00003&#39;, &#39;15.5us&#39;, &#39;nan&#39;]) # doctest: +SKIP</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 06:05:01.000030&#39;, &#39;0 days 00:00:00.000015500&#39;, NaT],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> Converting numbers by specifying the `unit` keyword argument:</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(np.arange(5), unit=&#39;s&#39;) # doctest: +SKIP</span>
<span class="sd"> TimedeltaIndex([&#39;0 days 00:00:00&#39;, &#39;0 days 00:00:01&#39;, &#39;0 days 00:00:02&#39;,</span>
<span class="sd"> &#39;0 days 00:00:03&#39;, &#39;0 days 00:00:04&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &gt;&gt;&gt; ps.to_timedelta(np.arange(5), unit=&#39;d&#39;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;0 days&#39;, &#39;1 days&#39;, &#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">pandas_to_timedelta</span><span class="p">(</span><span class="n">pser</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="o">=</span><span class="n">pser</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">pandas_to_timedelta</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_timedelta</span><span class="p">(</span>
<span class="n">arg</span><span class="o">=</span><span class="n">arg</span><span class="p">,</span>
<span class="n">unit</span><span class="o">=</span><span class="n">unit</span><span class="p">,</span>
<span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="timedelta_range"><!-- Highlighted source listing for timedelta_range. As shown at the end of this block, the function asserts that freq is not "N" or "ns", then calls pd.timedelta_range with the same start, end, periods, freq, name and closed arguments, converts the result with ps.from_pandas, and casts it to TimedeltaIndex. The [docs] anchor links back to the API reference page for this function. --><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.timedelta_range.html#pyspark.pandas.timedelta_range">[docs]</a><span class="k">def</span> <span class="nf">timedelta_range</span><span class="p">(</span>
<span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">end</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">freq</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">DateOffset</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">closed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">TimedeltaIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a fixed frequency TimedeltaIndex, with day as the default frequency.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : str or timedelta-like, optional</span>
<span class="sd"> Left bound for generating timedeltas.</span>
<span class="sd"> end : str or timedelta-like, optional</span>
<span class="sd"> Right bound for generating timedeltas.</span>
<span class="sd"> periods : int, optional</span>
<span class="sd"> Number of periods to generate.</span>
<span class="sd"> freq : str or DateOffset, default &#39;D&#39;</span>
<span class="sd"> Frequency strings can have multiples, e.g. &#39;5H&#39;.</span>
<span class="sd"> name : str, default None</span>
<span class="sd"> Name of the resulting TimedeltaIndex.</span>
<span class="sd"> closed : {None, &#39;left&#39;, &#39;right&#39;}, optional</span>
<span class="sd"> Make the interval closed with respect to the given frequency to</span>
<span class="sd"> the &#39;left&#39;, &#39;right&#39;, or both sides (None, the default).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> TimedeltaIndex</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,</span>
<span class="sd"> exactly three must be specified. If ``freq`` is omitted, the resulting</span>
<span class="sd"> ``TimedeltaIndex`` will have ``periods`` linearly spaced elements between</span>
<span class="sd"> ``start`` and ``end`` (closed on both sides).</span>
<span class="sd"> To learn more about the frequency strings, please see `this link</span>
<span class="sd"> &lt;https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases&gt;`__.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, periods=4) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days&#39;, &#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;], dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> The closed parameter specifies which endpoint is included.</span>
<span class="sd"> The default behavior is to include both endpoints.</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, periods=4, closed=&#39;right&#39;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;2 days&#39;, &#39;3 days&#39;, &#39;4 days&#39;], dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> The freq parameter specifies the frequency of the TimedeltaIndex.</span>
<span class="sd"> Only fixed frequencies can be passed, non-fixed frequencies such as ‘M’ (month end) will raise.</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, end=&#39;2 days&#39;, freq=&#39;6H&#39;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 00:00:00&#39;, &#39;1 days 06:00:00&#39;, &#39;1 days 12:00:00&#39;,</span>
<span class="sd"> &#39;1 days 18:00:00&#39;, &#39;2 days 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> Specify start, end, and periods; the frequency is generated automatically (linearly spaced).</span>
<span class="sd"> &gt;&gt;&gt; ps.timedelta_range(start=&#39;1 day&#39;, end=&#39;5 days&#39;, periods=4)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> TimedeltaIndex([&#39;1 days 00:00:00&#39;, &#39;2 days 08:00:00&#39;, &#39;3 days 16:00:00&#39;,</span>
<span class="sd"> &#39;5 days 00:00:00&#39;],</span>
<span class="sd"> dtype=&#39;timedelta64[ns]&#39;, freq=None)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">freq</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;N&quot;</span><span class="p">,</span> <span class="s2">&quot;ns&quot;</span><span class="p">],</span> <span class="s2">&quot;nanoseconds is not supported&quot;</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">TimedeltaIndex</span><span class="p">,</span>
<span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span>
<span class="n">pd</span><span class="o">.</span><span class="n">timedelta_range</span><span class="p">(</span>
<span class="n">start</span><span class="o">=</span><span class="n">start</span><span class="p">,</span>
<span class="n">end</span><span class="o">=</span><span class="n">end</span><span class="p">,</span>
<span class="n">periods</span><span class="o">=</span><span class="n">periods</span><span class="p">,</span>
<span class="n">freq</span><span class="o">=</span><span class="n">freq</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span>
<span class="n">closed</span><span class="o">=</span><span class="n">closed</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="get_dummies"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.get_dummies.html#pyspark.pandas.get_dummies">[docs]</a><span class="k">def</span> <span class="nf">get_dummies</span><span class="p">(</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">prefix</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">prefix_sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;_&quot;</span><span class="p">,</span>
<span class="n">dummy_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">sparse</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">drop_first</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">dtype</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert categorical variable into dummy/indicator variables, also</span>
<span class="sd"> known as one hot encoding.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> data : array-like, Series, or DataFrame</span>
<span class="sd"> prefix : string, list of strings, or dict of strings, default None</span>
<span class="sd"> String to append DataFrame column names.</span>
<span class="sd"> Pass a list with length equal to the number of columns</span>
<span class="sd"> when calling get_dummies on a DataFrame. Alternatively, `prefix`</span>
<span class="sd"> can be a dictionary mapping column names to prefixes.</span>
<span class="sd"> prefix_sep : string, default &#39;_&#39;</span>
<span class="sd"> If appending prefix, separator/delimiter to use. Or pass a</span>
<span class="sd"> list or dictionary as with `prefix`.</span>
<span class="sd"> dummy_na : bool, default False</span>
<span class="sd"> Add a column to indicate NaNs, if False NaNs are ignored.</span>
<span class="sd"> columns : list-like, default None</span>
<span class="sd"> Column names in the DataFrame to be encoded.</span>
<span class="sd"> If `columns` is None then all the columns with</span>
<span class="sd"> `object` or `category` dtype will be converted.</span>
<span class="sd"> sparse : bool, default False</span>
<span class="sd"> Whether the dummy-encoded columns should be backed by</span>
<span class="sd"> a :class:`SparseArray` (True) or a regular NumPy array (False).</span>
<span class="sd"> In pandas-on-Spark, this value must be &quot;False&quot;.</span>
<span class="sd"> drop_first : bool, default False</span>
<span class="sd"> Whether to get k-1 dummies out of k categorical levels by removing the</span>
<span class="sd"> first level.</span>
<span class="sd"> dtype : dtype, default np.uint8</span>
<span class="sd"> Data type for new columns. Only a single dtype is allowed.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> dummies : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.str.get_dummies</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(list(&#39;abca&#39;))</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(s)</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 0 0</span>
<span class="sd"> 1 0 1 0</span>
<span class="sd"> 2 0 0 1</span>
<span class="sd"> 3 1 0 0</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;a&#39;], &#39;B&#39;: [&#39;b&#39;, &#39;a&#39;, &#39;c&#39;],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 3]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(df, prefix=[&#39;col1&#39;, &#39;col2&#39;])</span>
<span class="sd"> C col1_a col1_b col2_a col2_b col2_c</span>
<span class="sd"> 0 1 1 0 0 1 0</span>
<span class="sd"> 1 2 0 1 1 0 0</span>
<span class="sd"> 2 3 1 0 0 0 1</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abcaa&#39;)))</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 0 0</span>
<span class="sd"> 1 0 1 0</span>
<span class="sd"> 2 0 0 1</span>
<span class="sd"> 3 1 0 0</span>
<span class="sd"> 4 1 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abcaa&#39;)), drop_first=True)</span>
<span class="sd"> b c</span>
<span class="sd"> 0 0 0</span>
<span class="sd"> 1 1 0</span>
<span class="sd"> 2 0 1</span>
<span class="sd"> 3 0 0</span>
<span class="sd"> 4 0 0</span>
<span class="sd"> &gt;&gt;&gt; ps.get_dummies(ps.Series(list(&#39;abc&#39;)), dtype=float)</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1.0 0.0 0.0</span>
<span class="sd"> 1 0.0 1.0 0.0</span>
<span class="sd"> 2 0.0 0.0 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">sparse</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">False</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;get_dummies currently does not support sparse&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Input must be a list-like for parameter `columns`&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dtype</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="s2">&quot;byte&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">prefix</span><span class="p">)]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;get_dummies currently does not support prefix as string types&quot;</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_default_accept_types</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span> <span class="o">==</span> <span class="n">columns</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span>
<span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="p">:])</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span>
<span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">==</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span>
<span class="k">else</span> <span class="s2">&quot;&quot;</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">]</span>
<span class="k">elif</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">any</span><span class="p">(</span>
<span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Expected tuple, got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="nb">type</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">))</span><span class="o">.</span><span class="n">pop</span><span class="p">())</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">columns</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="o">==</span> <span class="n">key</span> <span class="ow">or</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">key</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">{}</span><span class="s2"> not in index&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="n">column_labels_set</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="n">remaining_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="p">(</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span>
<span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">else</span> <span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_set</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span>
<span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;get_dummies currently only accept </span><span class="si">{}</span><span class="s2"> values&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="s2">&quot;, &quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span>
<span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="n">t</span><span class="p">)</span><span class="o">.</span><span class="n">typeName</span><span class="p">()</span> <span class="k">for</span> <span class="n">t</span> <span class="ow">in</span> <span class="n">_get_dummies_acceptable_types</span><span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Length of &#39;prefix&#39; (</span><span class="si">{}</span><span class="s2">) did not match the length of &quot;</span>
<span class="s2">&quot;the columns being encoded (</span><span class="si">{}</span><span class="s2">).&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">prefix</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">prefix</span><span class="p">,</span> <span class="nb">dict</span><span class="p">):</span>
<span class="n">prefix</span> <span class="o">=</span> <span class="p">[</span><span class="n">prefix</span><span class="p">[</span><span class="n">column_label</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span> <span class="k">for</span> <span class="n">column_label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="n">all_values</span> <span class="o">=</span> <span class="n">_reduce_spark_multi</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">,</span>
<span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">):</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">all_values</span><span class="p">[</span><span class="n">i</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="n">values</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="k">if</span> <span class="n">drop_first</span><span class="p">:</span>
<span class="n">values</span> <span class="o">=</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:]</span>
<span class="k">def</span> <span class="nf">column_name</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Name</span><span class="p">:</span>
<span class="k">if</span> <span class="n">prefix</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">prefix</span><span class="p">)[</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">v</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="s2">&quot;</span><span class="si">{}{}{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">prefix</span><span class="p">)[</span><span class="n">i</span><span class="p">],</span> <span class="n">prefix_sep</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span>
<span class="k">for</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">values</span><span class="p">:</span>
<span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">notnull</span><span class="p">()</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="o">==</span> <span class="n">value</span><span class="p">))</span>
<span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">value</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">dummy_na</span><span class="p">:</span>
<span class="n">remaining_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">column_name</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">nan</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">remaining_columns</span><span class="p">]</span></div>
<span class="c1"># TODO: many parameters of pandas&#39; pd.concat are not yet implemented or supported here.</span>
<div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.concat.html#pyspark.pandas.concat">[docs]</a><span class="k">def</span> <span class="nf">concat</span><span class="p">(</span>
<span class="n">objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]],</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">join</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;outer&quot;</span><span class="p">,</span>
<span class="n">ignore_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Concatenate pandas-on-Spark objects along a particular axis with optional set logic</span>
<span class="sd"> along the other axes.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> objs : a sequence of Series or DataFrame</span>
<span class="sd"> Any None objects will be dropped silently unless</span>
<span class="sd"> they are all None, in which case a ValueError will be raised.</span>
<span class="sd"> axis : {0/&#39;index&#39;, 1/&#39;columns&#39;}, default 0</span>
<span class="sd"> The axis to concatenate along.</span>
<span class="sd"> join : {&#39;inner&#39;, &#39;outer&#39;}, default &#39;outer&#39;</span>
<span class="sd"> How to handle indexes on other axis (or axes).</span>
<span class="sd"> ignore_index : bool, default False</span>
<span class="sd"> If True, do not use the index values along the concatenation axis. The</span>
<span class="sd"> resulting axis will be labeled 0, ..., n - 1. This is useful if you are</span>
<span class="sd"> concatenating objects where the concatenation axis does not have</span>
<span class="sd"> meaningful indexing information. Note the index values on the other</span>
<span class="sd"> axes are still respected in the join.</span>
<span class="sd"> sort : bool, default False</span>
<span class="sd"> Sort non-concatenation axis if it is not already aligned.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> object, type of objs</span>
<span class="sd"> When concatenating all ``Series`` along the index (axis=0), a</span>
<span class="sd"> ``Series`` is returned. When ``objs`` contains at least one</span>
<span class="sd"> ``DataFrame``, a ``DataFrame`` is returned. When concatenating along</span>
<span class="sd"> the columns (axis=1), a ``DataFrame`` is returned.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.append : Concatenate Series.</span>
<span class="sd"> DataFrame.join : Join DataFrames using indexes.</span>
<span class="sd"> DataFrame.merge : Merge DataFrames by indexes or columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.pandas.config import set_option, reset_option</span>
<span class="sd"> &gt;&gt;&gt; set_option(&quot;compute.ops_on_diff_frames&quot;, True)</span>
<span class="sd"> Combine two ``Series``.</span>
<span class="sd"> &gt;&gt;&gt; s1 = ps.Series([&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; s2 = ps.Series([&#39;c&#39;, &#39;d&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([s1, s2])</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 b</span>
<span class="sd"> 0 c</span>
<span class="sd"> 1 d</span>
<span class="sd"> dtype: object</span>
<span class="sd"> Clear the existing index and reset it in the result</span>
<span class="sd"> by setting the ``ignore_index`` option to ``True``.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([s1, s2], ignore_index=True)</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 b</span>
<span class="sd"> 2 c</span>
<span class="sd"> 3 d</span>
<span class="sd"> dtype: object</span>
<span class="sd"> Combine two ``DataFrame`` objects with identical columns.</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame([[&#39;a&#39;, 1], [&#39;b&#39;, 2]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame([[&#39;c&#39;, 3], [&#39;d&#39;, 4]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df2])</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> Combine ``DataFrame`` and ``Series`` objects with different columns.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df2, s1])</span>
<span class="sd"> letter number 0</span>
<span class="sd"> 0 c 3.0 None</span>
<span class="sd"> 1 d 4.0 None</span>
<span class="sd"> 0 None NaN a</span>
<span class="sd"> 1 None NaN b</span>
<span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span>
<span class="sd"> and return everything. Columns outside the intersection will</span>
<span class="sd"> be filled with ``None`` values.</span>
<span class="sd"> &gt;&gt;&gt; df3 = ps.DataFrame([[&#39;c&#39;, 3, &#39;cat&#39;], [&#39;d&#39;, 4, &#39;dog&#39;]],</span>
<span class="sd"> ... columns=[&#39;letter&#39;, &#39;number&#39;, &#39;animal&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df3</span>
<span class="sd"> letter number animal</span>
<span class="sd"> 0 c 3 cat</span>
<span class="sd"> 1 d 4 dog</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3])</span>
<span class="sd"> letter number animal</span>
<span class="sd"> 0 a 1 None</span>
<span class="sd"> 1 b 2 None</span>
<span class="sd"> 0 c 3 cat</span>
<span class="sd"> 1 d 4 dog</span>
<span class="sd"> Sort the columns.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3], sort=True)</span>
<span class="sd"> animal letter number</span>
<span class="sd"> 0 None a 1</span>
<span class="sd"> 1 None b 2</span>
<span class="sd"> 0 cat c 3</span>
<span class="sd"> 1 dog d 4</span>
<span class="sd"> Combine ``DataFrame`` objects with overlapping columns</span>
<span class="sd"> and return only those that are shared by passing ``inner`` to</span>
<span class="sd"> the ``join`` keyword argument.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df3], join=&quot;inner&quot;)</span>
<span class="sd"> letter number</span>
<span class="sd"> 0 a 1</span>
<span class="sd"> 1 b 2</span>
<span class="sd"> 0 c 3</span>
<span class="sd"> 1 d 4</span>
<span class="sd"> &gt;&gt;&gt; df4 = ps.DataFrame([[&#39;bird&#39;, &#39;polly&#39;], [&#39;monkey&#39;, &#39;george&#39;]],</span>
<span class="sd"> ... columns=[&#39;animal&#39;, &#39;name&#39;])</span>
<span class="sd"> Combine with column axis.</span>
<span class="sd"> &gt;&gt;&gt; ps.concat([df1, df4], axis=1)</span>
<span class="sd"> letter number animal name</span>
<span class="sd"> 0 a 1 bird polly</span>
<span class="sd"> 1 b 2 monkey george</span>
<span class="sd"> &gt;&gt;&gt; reset_option(&quot;compute.ops_on_diff_frames&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">objs</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">))</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">objs</span><span class="p">,</span> <span class="n">Iterable</span>
<span class="p">):</span> <span class="c1"># TODO: support dict</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;first argument must be an iterable of pandas-on-Spark &quot;</span>
<span class="s2">&quot;objects, you passed an object of type &quot;</span>
<span class="s1">&#39;&quot;</span><span class="si">{name}</span><span class="s1">&quot;&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Sized</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;No objects to concatenate&quot;</span><span class="p">)</span>
<span class="n">objs</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="n">obj</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">,</span> <span class="n">objs</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;All objects passed were None&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;cannot concatenate object of type &quot;</span>
<span class="s2">&quot;&#39;</span><span class="si">{name}</span><span class="s2">&quot;</span>
<span class="s2">&quot;; only ps.Series &quot;</span>
<span class="s2">&quot;and ps.DataFrame are valid&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">objs</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">join</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span> <span class="s2">&quot;outer&quot;</span><span class="p">]:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Only can inner (intersect) or outer (union) join the other axis.&quot;</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">obj</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span>
<span class="p">]</span>
<span class="n">level</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">DataFrame</span><span class="o">.</span><span class="n">_index_normalized_frame</span><span class="p">(</span><span class="n">level</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">&gt;</span> <span class="n">level</span>
<span class="k">else</span> <span class="n">psdf</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">psdfs_not_same_anchor</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">1</span><span class="p">:]:</span>
<span class="n">duplicated</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">duplicated</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">pretty_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicated</span><span class="p">]</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Labels have to be unique; however, got duplicated labels </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="n">pretty_names</span>
<span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span>
<span class="k">if</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">concat_psdf</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psdfs_not_same_anchor</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdfs_not_same_anchor</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">resolve_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">,</span> <span class="n">that_column_labels</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">AssertionError</span><span class="p">(</span><span class="s2">&quot;This should not happen.&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs_not_same_anchor</span><span class="p">:</span>
<span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;inner&quot;</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">resolve_func</span><span class="p">,</span>
<span class="n">concat_psdf</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;outer&quot;</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">resolve_func</span><span class="p">,</span>
<span class="n">concat_psdf</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="p">[</span><span class="n">column_labels</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span>
<span class="nb">map</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">_range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">concat_psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="n">concat_psdf</span> <span class="o">=</span> <span class="n">concat_psdf</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="k">return</span> <span class="n">concat_psdf</span>
<span class="c1"># Series, Series ...</span>
<span class="c1"># We should return Series if objects are all Series.</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="nb">all</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">obj</span><span class="p">:</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">),</span> <span class="n">objs</span><span class="p">))</span>
<span class="c1"># DataFrame, Series ... &amp; Series, Series ...</span>
<span class="c1"># In this case, we should return DataFrame.</span>
<span class="n">new_objs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">num_series</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">series_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">objs</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">num_series</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">series_names</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(</span><span class="n">DEFAULT_SERIES_NAME</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">)</span>
<span class="n">new_objs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="n">column_labels_levels</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="k">for</span> <span class="n">obj</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">column_labels_levels</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;MultiIndex columns should have the same levels&quot;</span><span class="p">)</span>
<span class="c1"># DataFrame, DataFrame, ...</span>
<span class="c1"># All Series are converted into DataFrame and then compute concat.</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">indices_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">index</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="n">index_of_first_psdf</span> <span class="o">=</span> <span class="n">indices_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">for</span> <span class="n">index_of_psdf</span> <span class="ow">in</span> <span class="n">indices_of_psdfs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span> <span class="o">!=</span> <span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Index type and names should be same in the objects to concatenate. &quot;</span>
<span class="s2">&quot;You passed different indices &quot;</span>
<span class="s2">&quot;</span><span class="si">{index_of_first_psdf}</span><span class="s2"> and </span><span class="si">{index_of_psdf}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">index_of_first_psdf</span><span class="o">=</span><span class="n">index_of_first_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span>
<span class="n">index_of_psdf</span><span class="o">=</span><span class="n">index_of_psdf</span><span class="o">.</span><span class="n">names</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">column_labels_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="n">index_names_of_psdfs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[[]</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names_of_psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="n">name</span> <span class="o">==</span> <span class="n">index_names_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">index_names_of_psdfs</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span>
<span class="n">idx</span> <span class="o">==</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">idx</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span>
<span class="p">):</span>
<span class="c1"># If all columns are in the same order and values, use it.</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="n">new_objs</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;inner&quot;</span><span class="p">:</span>
<span class="n">interested_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="o">*</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="nb">set</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">column_labels_of_psdfs</span><span class="p">))</span>
<span class="c1"># Keep the column order of the first DataFrame.</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">interested_columns</span>
<span class="p">]</span>
<span class="c1"># When multi-index column, although pandas is flaky if `join=&quot;inner&quot; and sort=False`,</span>
<span class="c1"># always sort to follow the `join=&quot;outer&quot;` case behavior.</span>
<span class="k">if</span> <span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">)</span> <span class="ow">or</span> <span class="n">sort</span><span class="p">:</span>
<span class="c1"># FIXME: better ordering</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">]</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">join</span> <span class="o">==</span> <span class="s2">&quot;outer&quot;</span><span class="p">:</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">labels</span> <span class="ow">in</span> <span class="n">column_labels_of_psdfs</span><span class="p">:</span>
<span class="n">merged_columns</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">label</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">labels</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">merged_columns</span><span class="p">)</span>
<span class="k">assert</span> <span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span>
<span class="c1"># Always sort when multi-index columns or there is more than one Series,</span>
<span class="c1"># and if there is only one Series, never sort.</span>
<span class="n">sort</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">or</span> <span class="n">num_series</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">or</span> <span class="p">(</span><span class="n">num_series</span> <span class="o">!=</span> <span class="mi">1</span> <span class="ow">and</span> <span class="n">sort</span><span class="p">)</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="c1"># FIXME: better ordering</span>
<span class="n">merged_columns</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="n">name_like_string</span><span class="p">)</span>
<span class="n">psdfs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">new_objs</span><span class="p">:</span>
<span class="n">columns_to_add</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">set</span><span class="p">(</span><span class="n">merged_columns</span><span class="p">)</span> <span class="o">-</span> <span class="nb">set</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">))</span>
<span class="c1"># TODO: NaN and None difference for missing values. pandas seems filling NaN.</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> <span class="o">+</span> <span class="p">[</span>
<span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">columns_to_add</span>
<span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="o">+</span> <span class="n">columns_to_add</span><span class="p">),</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> <span class="o">+</span> <span class="p">([</span><span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="nb">len</span><span class="p">(</span><span class="n">columns_to_add</span><span class="p">))),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">psdfs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="p">[</span><span class="n">merged_columns</span><span class="p">])</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdfs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">psdf</span> <span class="ow">in</span> <span class="n">psdfs</span>
<span class="p">]</span>
<span class="n">concatenated</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">union</span><span class="p">(</span><span class="n">y</span><span class="p">),</span> <span class="n">sdfs</span><span class="p">)</span>
<span class="k">if</span> <span class="n">ignore_index</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="n">result_psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">concatenated</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">concatenated</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">psdfs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="c1"># TODO: dtypes?</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span>
<span class="c1"># If all input were Series, we should return Series.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_names</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">series_names</span><span class="o">.</span><span class="n">pop</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">result_psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">result_psdf</span></div>
<!-- viewcode block "melt": Pygments-highlighted listing of the Python function melt(); the "[docs]" anchor links back to its API reference page (pyspark.pandas.melt.html#pyspark.pandas.melt). --><div class="viewcode-block" id="melt"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.melt.html#pyspark.pandas.melt">[docs]</a><span class="k">def</span> <span class="nf">melt</span><span class="p">(</span>
<span class="n">frame</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">id_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">value_vars</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">var_name</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">value_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;value&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<!-- Listed body: forwards all five parameters, positionally and unchanged, to DataFrame.melt and returns its result. --><span class="k">return</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="p">(</span><span class="n">frame</span><span class="p">,</span> <span class="n">id_vars</span><span class="p">,</span> <span class="n">value_vars</span><span class="p">,</span> <span class="n">var_name</span><span class="p">,</span> <span class="n">value_name</span><span class="p">)</span></div>
<!-- Listing line outside the "melt" viewcode block: assigns DataFrame.melt.__doc__ to melt.__doc__, so the module-level function reuses the method's docstring. --><span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">melt</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="isna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.isna.html#pyspark.pandas.isna">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">isna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect missing values for an array-like object.</span>
<span class="sd"> This function takes a scalar or array-like object and indicates</span>
<span class="sd"> whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``</span>
<span class="sd"> in object arrays).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> obj : scalar or array-like</span>
<span class="sd"> Object to check for null or missing values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool or array-like of bool</span>
<span class="sd"> For scalar input, returns a scalar boolean.</span>
<span class="sd"> For array input, returns an array of boolean indicating whether each</span>
<span class="sd"> corresponding element is missing.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.isna : Detect missing values in a Series.</span>
<span class="sd"> Series.isnull : Detect missing values in a Series.</span>
<span class="sd"> DataFrame.isna : Detect missing values in a DataFrame.</span>
<span class="sd"> DataFrame.isnull : Detect missing values in a DataFrame.</span>
<span class="sd"> Index.isna : Detect missing values in an Index.</span>
<span class="sd"> Index.isnull : Detect missing values in an Index.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Scalar arguments (including strings) result in a scalar boolean.</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(&#39;dog&#39;)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(np.nan)</span>
<span class="sd"> True</span>
<span class="sd"> ndarrays result in an ndarray of booleans.</span>
<span class="sd"> &gt;&gt;&gt; array = np.array([[1, np.nan, 3], [4, 5, np.nan]])</span>
<span class="sd"> &gt;&gt;&gt; array</span>
<span class="sd"> array([[ 1., nan, 3.],</span>
<span class="sd"> [ 4., 5., nan]])</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(array)</span>
<span class="sd"> array([[False, True, False],</span>
<span class="sd"> [False, False, True]])</span>
<span class="sd"> For Series and DataFrame, the same type is returned, containing booleans.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [&#39;ant&#39;, &#39;bee&#39;, &#39;cat&#39;], &#39;b&#39;: [&#39;dog&#39;, None, &#39;fly&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 ant dog</span>
<span class="sd"> 1 bee None</span>
<span class="sd"> 2 cat fly</span>
<span class="sd"> &gt;&gt;&gt; ps.isna(df)</span>
<span class="sd"> a b</span>
<span class="sd"> 0 False False</span>
<span class="sd"> 1 False True</span>
<span class="sd"> 2 False False</span>
<span class="sd"> &gt;&gt;&gt; ps.isnull(df.b)</span>
<span class="sd"> 0 False</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> Name: b, dtype: bool</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># TODO: Add back:</span>
<span class="c1"># notnull : Boolean inverse of pandas.isnull.</span>
<span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div>
<span class="n">isnull</span> <span class="o">=</span> <span class="n">isna</span>
<div class="viewcode-block" id="notna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.notna.html#pyspark.pandas.notna">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect existing (non-missing) values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are not NA.</span>
<span class="sd"> Non-missing values get mapped to True. NA values, such as None or</span>
<span class="sd"> :attr:`numpy.NaN`, get mapped to False values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool or array-like of bool</span>
<span class="sd"> Mask of bool values for each element that</span>
<span class="sd"> indicates whether an element is not an NA value.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> isna : Detect missing values for an array-like object.</span>
<span class="sd"> Series.notna : Boolean inverse of Series.isna.</span>
<span class="sd"> DataFrame.notnull : Boolean inverse of DataFrame.isnull.</span>
<span class="sd"> Index.notna : Boolean inverse of Index.isna.</span>
<span class="sd"> Index.notnull : Boolean inverse of Index.isnull.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Show which entries in a DataFrame are not NA.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;age&#39;: [5, 6, np.NaN],</span>
<span class="sd"> ... &#39;born&#39;: [pd.NaT, pd.Timestamp(&#39;1939-05-27&#39;),</span>
<span class="sd"> ... pd.Timestamp(&#39;1940-04-25&#39;)],</span>
<span class="sd"> ... &#39;name&#39;: [&#39;Alfred&#39;, &#39;Batman&#39;, &#39;&#39;],</span>
<span class="sd"> ... &#39;toy&#39;: [None, &#39;Batmobile&#39;, &#39;Joker&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> age born name toy</span>
<span class="sd"> 0 5.0 NaT Alfred None</span>
<span class="sd"> 1 6.0 1939-05-27 Batman Batmobile</span>
<span class="sd"> 2 NaN 1940-04-25 Joker</span>
<span class="sd"> &gt;&gt;&gt; df.notnull()</span>
<span class="sd"> age born name toy</span>
<span class="sd"> 0 True False True False</span>
<span class="sd"> 1 True True True True</span>
<span class="sd"> 2 False True True True</span>
<span class="sd"> Show which entries in a Series are not NA.</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.NaN])</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 5.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; ps.notna(ser)</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ps.notna(ser.index)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># TODO: Add back:</span>
<span class="c1"># Series.notnull : Boolean inverse of Series.isnull.</span>
<span class="c1"># DataFrame.notna : Boolean inverse of DataFrame.isna.</span>
<span class="c1"># into the See Also in the docstring. It does not find the method in the latest numpydoc.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">notna</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">notna</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span></div>
<span class="n">notnull</span> <span class="o">=</span> <span class="n">notna</span>
<div class="viewcode-block" id="merge"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge.html#pyspark.pandas.merge">[docs]</a><span class="k">def</span> <span class="nf">merge</span><span class="p">(</span>
<span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">right</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">&quot;_x&quot;</span><span class="p">,</span> <span class="s2">&quot;_y&quot;</span><span class="p">),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Merge DataFrame objects with a database-style join.</span>
<span class="sd"> The index of the resulting DataFrame will be one of the following:</span>
<span class="sd"> - 0...n if no index is used for merging</span>
<span class="sd"> - Index of the left DataFrame if merged only on the index of the right DataFrame</span>
<span class="sd"> - Index of the right DataFrame if merged only on the index of the left DataFrame</span>
<span class="sd"> - All involved indices if merged using the indices of both DataFrames</span>
<span class="sd"> e.g. if `left` has indices (a, x) and `right` has indices (b, x), the result will</span>
<span class="sd"> be an index (x, a, b)</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> right: Object to merge with.</span>
<span class="sd"> how: Type of merge to be performed.</span>
<span class="sd"> {&#39;left&#39;, &#39;right&#39;, &#39;outer&#39;, &#39;inner&#39;}, default &#39;inner&#39;</span>
<span class="sd"> left: use only keys from left frame, similar to a SQL left outer join; preserve key</span>
<span class="sd"> order.</span>
<span class="sd"> right: use only keys from right frame, similar to a SQL right outer join; preserve key</span>
<span class="sd"> order.</span>
<span class="sd"> outer: use union of keys from both frames, similar to a SQL full outer join; sort keys</span>
<span class="sd"> lexicographically.</span>
<span class="sd"> inner: use intersection of keys from both frames, similar to a SQL inner join;</span>
<span class="sd"> preserve the order of the left keys.</span>
<span class="sd"> on: Column or index level names to join on. These must be found in both DataFrames. If on</span>
<span class="sd"> is None and not merging on indexes then this defaults to the intersection of the</span>
<span class="sd"> columns in both DataFrames.</span>
<span class="sd"> left_on: Column or index level names to join on in the left DataFrame. Can also</span>
<span class="sd"> be an array or list of arrays of the length of the left DataFrame.</span>
<span class="sd"> These arrays are treated as if they are columns.</span>
<span class="sd"> right_on: Column or index level names to join on in the right DataFrame. Can also</span>
<span class="sd"> be an array or list of arrays of the length of the right DataFrame.</span>
<span class="sd"> These arrays are treated as if they are columns.</span>
<span class="sd"> left_index: Use the index from the left DataFrame as the join key(s). If it is a</span>
<span class="sd"> MultiIndex, the number of keys in the other DataFrame (either the index or a number of</span>
<span class="sd"> columns) must match the number of levels.</span>
<span class="sd"> right_index: Use the index from the right DataFrame as the join key. Same caveats as</span>
<span class="sd"> left_index.</span>
<span class="sd"> suffixes: Suffix to apply to overlapping column names in the left and right side,</span>
<span class="sd"> respectively.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> A DataFrame of the two merged objects.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame({&#39;lkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [1, 2, 3, 5]},</span>
<span class="sd"> ... columns=[&#39;lkey&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame({&#39;rkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [5, 6, 7, 8]},</span>
<span class="sd"> ... columns=[&#39;rkey&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1</span>
<span class="sd"> lkey value</span>
<span class="sd"> 0 foo 1</span>
<span class="sd"> 1 bar 2</span>
<span class="sd"> 2 baz 3</span>
<span class="sd"> 3 foo 5</span>
<span class="sd"> &gt;&gt;&gt; df2</span>
<span class="sd"> rkey value</span>
<span class="sd"> 0 foo 5</span>
<span class="sd"> 1 bar 6</span>
<span class="sd"> 2 baz 7</span>
<span class="sd"> 3 foo 8</span>
<span class="sd"> Merge df1 and df2 on the lkey and rkey columns. The value columns have</span>
<span class="sd"> the default suffixes, _x and _y, appended.</span>
<span class="sd"> &gt;&gt;&gt; merged = ps.merge(df1, df2, left_on=&#39;lkey&#39;, right_on=&#39;rkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; merged.sort_values(by=[&#39;lkey&#39;, &#39;value_x&#39;, &#39;rkey&#39;, &#39;value_y&#39;]) # doctest: +ELLIPSIS</span>
<span class="sd"> lkey value_x rkey value_y</span>
<span class="sd"> ...bar 2 bar 6</span>
<span class="sd"> ...baz 3 baz 7</span>
<span class="sd"> ...foo 1 foo 5</span>
<span class="sd"> ...foo 1 foo 8</span>
<span class="sd"> ...foo 5 foo 5</span>
<span class="sd"> ...foo 5 foo 8</span>
<span class="sd"> &gt;&gt;&gt; left_psdf = ps.DataFrame({&#39;A&#39;: [1, 2]})</span>
<span class="sd"> &gt;&gt;&gt; right_psdf = ps.DataFrame({&#39;B&#39;: [&#39;x&#39;, &#39;y&#39;]}, index=[1, 2])</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 2 x</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;left&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 None</span>
<span class="sd"> 1 2 x</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;right&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 2.0 x</span>
<span class="sd"> 2 NaN y</span>
<span class="sd"> &gt;&gt;&gt; ps.merge(left_psdf, right_psdf, left_index=True, right_index=True, how=&#39;outer&#39;).sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1.0 None</span>
<span class="sd"> 1 2.0 x</span>
<span class="sd"> 2 NaN y</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> As described in #263, joining string columns currently returns None for missing values</span>
<span class="sd"> instead of NaN.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="n">merge</span><span class="p">(</span>
<span class="n">right</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="n">how</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span>
<span class="n">left_on</span><span class="o">=</span><span class="n">left_on</span><span class="p">,</span>
<span class="n">right_on</span><span class="o">=</span><span class="n">right_on</span><span class="p">,</span>
<span class="n">left_index</span><span class="o">=</span><span class="n">left_index</span><span class="p">,</span>
<span class="n">right_index</span><span class="o">=</span><span class="n">right_index</span><span class="p">,</span>
<span class="n">suffixes</span><span class="o">=</span><span class="n">suffixes</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="merge_asof"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.merge_asof.html#pyspark.pandas.merge_asof">[docs]</a><span class="k">def</span> <span class="nf">merge_asof</span><span class="p">(</span>
<span class="n">left</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">right</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Name</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">right_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">left_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">right_by</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">suffixes</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span><span class="s2">&quot;_x&quot;</span><span class="p">,</span> <span class="s2">&quot;_y&quot;</span><span class="p">),</span>
<span class="n">tolerance</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">allow_exact_matches</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">direction</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;backward&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Perform an asof merge.</span>
<span class="sd"> This is similar to a left-join except that we match on nearest</span>
<span class="sd"> key rather than equal keys.</span>
<span class="sd"> For each row in the left DataFrame:</span>
<span class="sd"> - A &quot;backward&quot; search selects the last row in the right DataFrame whose</span>
<span class="sd"> &#39;on&#39; key is less than or equal to the left&#39;s key.</span>
<span class="sd"> - A &quot;forward&quot; search selects the first row in the right DataFrame whose</span>
<span class="sd"> &#39;on&#39; key is greater than or equal to the left&#39;s key.</span>
<span class="sd"> - A &quot;nearest&quot; search selects the row in the right DataFrame whose &#39;on&#39;</span>
<span class="sd"> key is closest in absolute distance to the left&#39;s key.</span>
<span class="sd"> Optionally match on equivalent keys with &#39;by&#39; before searching with &#39;on&#39;.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> left : DataFrame or named Series</span>
<span class="sd"> right : DataFrame or named Series</span>
<span class="sd"> on : label</span>
<span class="sd"> Field name to join on. Must be found in both DataFrames.</span>
<span class="sd"> The data MUST be ordered. Furthermore this must be a numeric column,</span>
<span class="sd"> such as datetimelike, integer, or float. On or left_on/right_on</span>
<span class="sd"> must be given.</span>
<span class="sd"> left_on : label</span>
<span class="sd"> Field name to join on in left DataFrame.</span>
<span class="sd"> right_on : label</span>
<span class="sd"> Field name to join on in right DataFrame.</span>
<span class="sd"> left_index : bool</span>
<span class="sd"> Use the index of the left DataFrame as the join key.</span>
<span class="sd"> right_index : bool</span>
<span class="sd"> Use the index of the right DataFrame as the join key.</span>
<span class="sd"> by : column name or list of column names</span>
<span class="sd"> Match on these columns before performing merge operation.</span>
<span class="sd"> left_by : column name</span>
<span class="sd"> Field names to match on in the left DataFrame.</span>
<span class="sd"> right_by : column name</span>
<span class="sd"> Field names to match on in the right DataFrame.</span>
<span class="sd"> suffixes : 2-length sequence (tuple, list, ...)</span>
<span class="sd"> Suffix to apply to overlapping column names in the left and right</span>
<span class="sd"> side, respectively.</span>
<span class="sd"> tolerance : int or Timedelta, optional, default None</span>
<span class="sd"> Select asof tolerance within this range; must be compatible</span>
<span class="sd"> with the merge index.</span>
<span class="sd"> allow_exact_matches : bool, default True</span>
<span class="sd"> - If True, allow matching with the same &#39;on&#39; value</span>
<span class="sd"> (i.e. less-than-or-equal-to / greater-than-or-equal-to)</span>
<span class="sd"> - If False, don&#39;t match the same &#39;on&#39; value</span>
<span class="sd"> (i.e., strictly less-than / strictly greater-than).</span>
<span class="sd"> direction : &#39;backward&#39; (default), &#39;forward&#39;, or &#39;nearest&#39;</span>
<span class="sd"> Whether to search for prior, subsequent, or closest matches.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> merged : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> merge : Merge with a database-style join.</span>
<span class="sd"> merge_ordered : Merge with optional filling/interpolation.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; left = ps.DataFrame({&quot;a&quot;: [1, 5, 10], &quot;left_val&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; left</span>
<span class="sd"> a left_val</span>
<span class="sd"> 0 1 a</span>
<span class="sd"> 1 5 b</span>
<span class="sd"> 2 10 c</span>
<span class="sd"> &gt;&gt;&gt; right = ps.DataFrame({&quot;a&quot;: [1, 2, 3, 6, 7], &quot;right_val&quot;: [1, 2, 3, 6, 7]})</span>
<span class="sd"> &gt;&gt;&gt; right</span>
<span class="sd"> a right_val</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 2 2</span>
<span class="sd"> 2 3 3</span>
<span class="sd"> 3 6 6</span>
<span class="sd"> 4 7 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(left, right, on=&quot;a&quot;).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1</span>
<span class="sd"> 1 5 b 3</span>
<span class="sd"> 2 10 c 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... allow_exact_matches=False</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a NaN</span>
<span class="sd"> 1 5 b 3.0</span>
<span class="sd"> 2 10 c 7.0</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... direction=&quot;forward&quot;</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1.0</span>
<span class="sd"> 1 5 b 6.0</span>
<span class="sd"> 2 10 c NaN</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... left,</span>
<span class="sd"> ... right,</span>
<span class="sd"> ... on=&quot;a&quot;,</span>
<span class="sd"> ... direction=&quot;nearest&quot;</span>
<span class="sd"> ... ).sort_values(&quot;a&quot;).reset_index(drop=True)</span>
<span class="sd"> a left_val right_val</span>
<span class="sd"> 0 1 a 1</span>
<span class="sd"> 1 5 b 6</span>
<span class="sd"> 2 10 c 7</span>
<span class="sd"> We can use indexed DataFrames as well.</span>
<span class="sd"> &gt;&gt;&gt; left = ps.DataFrame({&quot;left_val&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;c&quot;]}, index=[1, 5, 10])</span>
<span class="sd"> &gt;&gt;&gt; left</span>
<span class="sd"> left_val</span>
<span class="sd"> 1 a</span>
<span class="sd"> 5 b</span>
<span class="sd"> 10 c</span>
<span class="sd"> &gt;&gt;&gt; right = ps.DataFrame({&quot;right_val&quot;: [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])</span>
<span class="sd"> &gt;&gt;&gt; right</span>
<span class="sd"> right_val</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> 6 6</span>
<span class="sd"> 7 7</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(left, right, left_index=True, right_index=True).sort_index()</span>
<span class="sd"> left_val right_val</span>
<span class="sd"> 1 a 1</span>
<span class="sd"> 5 b 3</span>
<span class="sd"> 10 c 7</span>
<span class="sd"> Here is a real-world time-series example</span>
<span class="sd"> &gt;&gt;&gt; quotes = ps.DataFrame(</span>
<span class="sd"> ... {</span>
<span class="sd"> ... &quot;time&quot;: [</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.030&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.041&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.049&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.072&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.075&quot;)</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;ticker&quot;: [</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;,</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;AAPL&quot;,</span>
<span class="sd"> ... &quot;GOOG&quot;,</span>
<span class="sd"> ... &quot;MSFT&quot;</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;bid&quot;: [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],</span>
<span class="sd"> ... &quot;ask&quot;: [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]</span>
<span class="sd"> ... }</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; quotes</span>
<span class="sd"> time ticker bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93</span>
<span class="sd"> 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96</span>
<span class="sd"> 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98</span>
<span class="sd"> 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93</span>
<span class="sd"> 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01</span>
<span class="sd"> 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88</span>
<span class="sd"> 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03</span>
<span class="sd"> &gt;&gt;&gt; trades = ps.DataFrame(</span>
<span class="sd"> ... {</span>
<span class="sd"> ... &quot;time&quot;: [</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.023&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.038&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;),</span>
<span class="sd"> ... pd.Timestamp(&quot;2016-05-25 13:30:00.048&quot;)</span>
<span class="sd"> ... ],</span>
<span class="sd"> ... &quot;ticker&quot;: [&quot;MSFT&quot;, &quot;MSFT&quot;, &quot;GOOG&quot;, &quot;GOOG&quot;, &quot;AAPL&quot;],</span>
<span class="sd"> ... &quot;price&quot;: [51.95, 51.95, 720.77, 720.92, 98.0],</span>
<span class="sd"> ... &quot;quantity&quot;: [75, 155, 100, 100, 100]</span>
<span class="sd"> ... }</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; trades</span>
<span class="sd"> time ticker price quantity</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 GOOG 720.77 100</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.92 100</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 AAPL 98.00 100</span>
<span class="sd"> By default we are taking the asof of the quotes</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades, quotes, on=&quot;time&quot;, by=&quot;ticker&quot;</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span>
<span class="sd"> We only asof within 2ms between the quote time and the trade time</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades,</span>
<span class="sd"> ... quotes,</span>
<span class="sd"> ... on=&quot;time&quot;,</span>
<span class="sd"> ... by=&quot;ticker&quot;,</span>
<span class="sd"> ... tolerance=F.expr(&quot;INTERVAL 2 MILLISECONDS&quot;) # pd.Timedelta(&quot;2ms&quot;)</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93</span>
<span class="sd"> We only asof within 10ms between the quote time and the trade time</span>
<span class="sd"> and we exclude exact matches on time. However *prior* data will</span>
<span class="sd"> propagate forward</span>
<span class="sd"> &gt;&gt;&gt; ps.merge_asof(</span>
<span class="sd"> ... trades,</span>
<span class="sd"> ... quotes,</span>
<span class="sd"> ... on=&quot;time&quot;,</span>
<span class="sd"> ... by=&quot;ticker&quot;,</span>
<span class="sd"> ... tolerance=F.expr(&quot;INTERVAL 10 MILLISECONDS&quot;), # pd.Timedelta(&quot;10ms&quot;)</span>
<span class="sd"> ... allow_exact_matches=False</span>
<span class="sd"> ... ).sort_values([&quot;time&quot;, &quot;ticker&quot;, &quot;price&quot;]).reset_index(drop=True)</span>
<span class="sd"> time ticker price quantity bid ask</span>
<span class="sd"> 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN</span>
<span class="sd"> 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98</span>
<span class="sd"> 2 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN</span>
<span class="sd"> 3 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN</span>
<span class="sd"> 4 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">to_list</span><span class="p">(</span><span class="n">os</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span>
    <span class="sd">&quot;&quot;&quot;Normalize ``os`` (None, a single name, or a list of names) into a list of labels.&quot;&quot;&quot;</span>
    <span class="c1"># Nothing was passed: there are no labels.</span>
    <span class="k">if</span> <span class="n">os</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
        <span class="k">return</span> <span class="p">[]</span>
    <span class="c1"># A value accepted by is_name_like_tuple is used unchanged as one label.</span>
    <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">os</span><span class="p">):</span>
        <span class="k">return</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">os</span><span class="p">)]</span>
    <span class="c1"># A value accepted by is_name_like_value is wrapped into a 1-tuple.</span>
    <span class="k">if</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">os</span><span class="p">):</span>
        <span class="k">return</span> <span class="p">[(</span><span class="n">os</span><span class="p">,)]</span>
    <span class="c1"># Anything else is iterated: tuple-like entries are kept, the rest are wrapped.</span>
    <span class="n">labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">entry</span> <span class="ow">in</span> <span class="n">os</span><span class="p">:</span>
        <span class="n">labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">entry</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">entry</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">entry</span><span class="p">,))</span>
    <span class="k">return</span> <span class="n">labels</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">left</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">right</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="n">on</span><span class="p">:</span>
<span class="k">if</span> <span class="n">left_on</span> <span class="ow">or</span> <span class="n">right_on</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s1">&#39;Can only pass argument &quot;on&quot; OR &quot;left_on&quot; and &quot;right_on&quot;, &#39;</span>
<span class="s2">&quot;not a combination of both.&quot;</span>
<span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">on</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">left_index</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;left can only have one index&quot;</span><span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_on</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">right_index</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;right can only have one index&quot;</span><span class="p">)</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_on</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Must pass right_on or right_index=True&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">right_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_as_of_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Must pass left_on or left_index=True&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">left_as_of_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_as_of_names</span><span class="p">:</span>
<span class="n">common</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">intersection</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">common</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;No common columns to perform merge on. Merge options: &quot;</span>
<span class="s2">&quot;left_on=None, right_on=None, left_index=False, right_index=False&quot;</span>
<span class="p">)</span>
<span class="n">left_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">common</span><span class="p">)))</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;can only asof on a key for left&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_as_of_names</span><span class="p">)</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;can only asof on a key for right&quot;</span><span class="p">)</span>
<span class="c1"># Resolve the exact-match grouping keys to Spark column names for each side.</span>
<span class="k">if</span> <span class="n">by</span><span class="p">:</span>
    <span class="c1"># `by` supplies the same key(s) for both sides, so it cannot be combined</span>
    <span class="c1"># with the per-side `left_by` / `right_by` arguments.</span>
    <span class="k">if</span> <span class="n">left_by</span> <span class="ow">or</span> <span class="n">right_by</span><span class="p">:</span>
        <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s1">&#39;Can only pass argument &quot;by&quot; OR &quot;left_by&quot; and &quot;right_by&quot;.&#39;</span><span class="p">)</span>
    <span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span>
    <span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">by</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
    <span class="c1"># Per-side keys; to_list(None) yields [], so either list may come back empty.</span>
    <span class="n">left_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">left_by</span><span class="p">)))</span>
    <span class="n">right_join_on_names</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">map</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">,</span> <span class="n">to_list</span><span class="p">(</span><span class="n">right_by</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">right_join_on_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;missing right_by&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">right_join_on_names</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">left_join_on_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;missing left_by&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">left_join_on_names</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">right_join_on_names</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;left_by and right_by must be same length&quot;</span><span class="p">)</span>
<span class="c1"># We should distinguish the name to avoid ambiguous column name after merging.</span>
<span class="n">right_prefix</span> <span class="o">=</span> <span class="s2">&quot;__right_&quot;</span>
<span class="n">right_as_of_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_as_of_name</span> <span class="k">for</span> <span class="n">right_as_of_name</span> <span class="ow">in</span> <span class="n">right_as_of_names</span><span class="p">]</span>
<span class="n">right_join_on_names</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">right_prefix</span> <span class="o">+</span> <span class="n">right_join_on_name</span> <span class="k">for</span> <span class="n">right_join_on_name</span> <span class="ow">in</span> <span class="n">right_join_on_names</span>
<span class="p">]</span>
<span class="n">left_as_of_name</span> <span class="o">=</span> <span class="n">left_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">right_as_of_name</span> <span class="o">=</span> <span class="n">right_as_of_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">resolve</span><span class="p">(</span><span class="n">internal</span><span class="p">:</span> <span class="n">InternalFrame</span><span class="p">,</span> <span class="n">side</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
    <span class="sd">&quot;&quot;&quot;Copy ``internal`` with each non-hidden Spark column renamed to ``__&lt;side&gt;_&lt;name&gt;``.&quot;&quot;&quot;</span>
    <span class="k">def</span> <span class="nf">rename</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
        <span class="c1"># Tag a column name with the side it belongs to.</span>
        <span class="k">return</span> <span class="s2">&quot;__</span><span class="si">{}</span><span class="s2">_</span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">side</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
    <span class="n">resolved</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span>
    <span class="n">source_sdf</span> <span class="o">=</span> <span class="n">resolved</span><span class="o">.</span><span class="n">spark_frame</span>
    <span class="c1"># Alias every regular column; HIDDEN_COLUMNS are selected under their own names.</span>
    <span class="n">aliased</span> <span class="o">=</span> <span class="p">[]</span>
    <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">source_sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">:</span>
        <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">HIDDEN_COLUMNS</span><span class="p">:</span>
            <span class="k">continue</span>
        <span class="n">aliased</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">source_sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">)))</span>
    <span class="n">renamed_sdf</span> <span class="o">=</span> <span class="n">source_sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">aliased</span><span class="p">,</span> <span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span>
    <span class="c1"># Re-point the index/data columns and fields at the renamed Spark columns.</span>
    <span class="n">index_scols</span> <span class="o">=</span> <span class="p">[</span>
        <span class="n">scol_for</span><span class="p">(</span><span class="n">renamed_sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">resolved</span><span class="o">.</span><span class="n">index_spark_column_names</span>
    <span class="p">]</span>
    <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">fld</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">fld</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">fld</span> <span class="ow">in</span> <span class="n">resolved</span><span class="o">.</span><span class="n">index_fields</span><span class="p">]</span>
    <span class="n">data_scols</span> <span class="o">=</span> <span class="p">[</span>
        <span class="n">scol_for</span><span class="p">(</span><span class="n">renamed_sdf</span><span class="p">,</span> <span class="n">rename</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">resolved</span><span class="o">.</span><span class="n">data_spark_column_names</span>
    <span class="p">]</span>
    <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span><span class="n">fld</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">rename</span><span class="p">(</span><span class="n">fld</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">fld</span> <span class="ow">in</span> <span class="n">resolved</span><span class="o">.</span><span class="n">data_fields</span><span class="p">]</span>
    <span class="k">return</span> <span class="n">resolved</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
        <span class="n">spark_frame</span><span class="o">=</span><span class="n">renamed_sdf</span><span class="p">,</span>
        <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_scols</span><span class="p">,</span>
        <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span>
        <span class="n">data_spark_columns</span><span class="o">=</span><span class="n">data_scols</span><span class="p">,</span>
        <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span>
    <span class="p">)</span>
<span class="n">left_internal</span> <span class="o">=</span> <span class="n">left</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="n">right_internal</span> <span class="o">=</span> <span class="n">resolve</span><span class="p">(</span><span class="n">right</span><span class="o">.</span><span class="n">_internal</span><span class="p">,</span> <span class="s2">&quot;right&quot;</span><span class="p">)</span>
<span class="n">left_table</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;left_table&quot;</span><span class="p">)</span>
<span class="n">right_table</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;right_table&quot;</span><span class="p">)</span>
<span class="n">left_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">left_as_of_name</span><span class="p">)</span>
<span class="n">right_as_of_column</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">right_as_of_name</span><span class="p">)</span>
<span class="k">if</span> <span class="n">left_join_on_names</span><span class="p">:</span>
<span class="n">left_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">left_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_join_on_names</span><span class="p">]</span>
<span class="n">right_join_on_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">right_table</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_join_on_names</span><span class="p">]</span>
<span class="n">on</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span><span class="p">:</span> <span class="n">lft</span> <span class="o">&amp;</span> <span class="n">rgt</span><span class="p">,</span>
<span class="p">[</span><span class="n">lft</span> <span class="o">==</span> <span class="n">rgt</span> <span class="k">for</span> <span class="n">lft</span><span class="p">,</span> <span class="n">rgt</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_join_on_columns</span><span class="p">,</span> <span class="n">right_join_on_columns</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">on</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">tolerance</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tolerance</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span>
<span class="n">tolerance</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">tolerance</span><span class="p">)</span>
<span class="n">as_of_joined_table</span> <span class="o">=</span> <span class="n">left_table</span><span class="o">.</span><span class="n">_joinAsOf</span><span class="p">(</span>
<span class="n">right_table</span><span class="p">,</span>
<span class="n">leftAsOfColumn</span><span class="o">=</span><span class="n">left_as_of_column</span><span class="p">,</span>
<span class="n">rightAsOfColumn</span><span class="o">=</span><span class="n">right_as_of_column</span><span class="p">,</span>
<span class="n">on</span><span class="o">=</span><span class="n">on</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;left&quot;</span><span class="p">,</span>
<span class="n">tolerance</span><span class="o">=</span><span class="n">tolerance</span><span class="p">,</span>
<span class="n">allowExactMatches</span><span class="o">=</span><span class="n">allow_exact_matches</span><span class="p">,</span>
<span class="n">direction</span><span class="o">=</span><span class="n">direction</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Unpack suffixes tuple for convenience</span>
<span class="n">left_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">right_suffix</span> <span class="o">=</span> <span class="n">suffixes</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="c1"># Append suffixes to columns with the same name to avoid conflicts later</span>
<span class="n">duplicate_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&amp;</span> <span class="nb">set</span><span class="p">(</span><span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span>
<span class="n">exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">def</span> <span class="nf">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
    <span class="sd">&quot;&quot;&quot;Fetch from the as-of joined table the column that ``left_internal`` maps ``label`` to.&quot;&quot;&quot;</span>
    <span class="n">column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">column_name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">:</span> <span class="n">Label</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
    <span class="sd">&quot;&quot;&quot;Fetch from the as-of joined table the column that ``right_internal`` maps ``label`` to.&quot;&quot;&quot;</span>
    <span class="n">column_name</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
    <span class="k">return</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">as_of_joined_table</span><span class="p">,</span> <span class="n">column_name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">left_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span>
<span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="p">(</span><span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span>
<span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="p">(</span><span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">pass</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">left_suffix</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">left_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="c1"># recover `right_prefix` here.</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">right_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)[</span><span class="nb">len</span><span class="p">(</span><span class="n">right_prefix</span><span class="p">)</span> <span class="p">:]</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">right_scol_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">duplicate_columns</span><span class="p">:</span>
<span class="n">spark_column_name</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">spark_column_name</span> <span class="ow">in</span> <span class="n">left_as_of_names</span> <span class="o">+</span> <span class="n">left_join_on_names</span> <span class="ow">and</span> <span class="p">(</span>
<span class="p">(</span><span class="n">right_prefix</span> <span class="o">+</span> <span class="n">spark_column_name</span><span class="p">)</span> <span class="ow">in</span> <span class="n">right_as_of_names</span> <span class="o">+</span> <span class="n">right_join_on_names</span>
<span class="p">):</span>
<span class="k">continue</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">col</span> <span class="o">+</span> <span class="n">right_suffix</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">([</span><span class="nb">str</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">+</span> <span class="n">right_suffix</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">[</span><span class="mi">1</span><span class="p">:]))</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="c1"># Retain indices if they are used for joining</span>
<span class="k">if</span> <span class="n">left_index</span> <span class="ow">or</span> <span class="n">right_index</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">left_index_scols</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">left_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_spark_column_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">exprs</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">left_index_scols</span><span class="p">)</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="n">left_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">selected_columns</span> <span class="o">=</span> <span class="n">as_of_joined_table</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">selected_columns</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">selected_columns</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div>
<div class="viewcode-block" id="to_numeric"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.to_numeric.html#pyspark.pandas.to_numeric">[docs]</a><span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">&quot;raise&quot;</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert argument to a numeric type.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> arg : scalar, list, tuple, 1-d array, or Series</span>
<span class="sd"> Argument to be converted.</span>
<span class="sd"> errors : {&#39;raise&#39;, &#39;coerce&#39;, &#39;ignore&#39;}, default &#39;raise&#39;</span>
<span class="sd"> * If &#39;coerce&#39;, then invalid parsing will be set as NaN.</span>
<span class="sd"> * If &#39;raise&#39;, then invalid parsing will raise an exception.</span>
<span class="sd"> * If &#39;ignore&#39;, then invalid parsing will return the input.</span>
<span class="sd"> .. note:: &#39;ignore&#39; doesn&#39;t work yet when `arg` is a pandas-on-Spark Series.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : numeric if parsing succeeded.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.astype : Cast argument to a specified dtype.</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> to_timedelta : Convert argument to timedelta.</span>
<span class="sd"> numpy.ndarray.astype : Cast a numpy array to a specified type.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 -3</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(psser)</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 -3.0</span>
<span class="sd"> dtype: float32</span>
<span class="sd"> If the given Series contains a value that cannot be cast to float, it is converted to `np.nan`</span>
<span class="sd"> when `errors` is set to &quot;coerce&quot;.</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&#39;apple&#39;, &#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 apple</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 -3</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(psser, errors=&quot;coerce&quot;)</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 -3.0</span>
<span class="sd"> dtype: float32</span>
<span class="sd"> A list, tuple, np.array, or scalar is also supported.</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;])</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric((&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;))</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(np.array([&#39;1.0&#39;, &#39;2&#39;, &#39;-3&#39;]))</span>
<span class="sd"> array([ 1., 2., -3.])</span>
<span class="sd"> &gt;&gt;&gt; ps.to_numeric(&#39;1.0&#39;)</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;coerce&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;float&quot;</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;raise&quot;</span><span class="p">:</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">scol_casted</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;float&quot;</span><span class="p">)</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">assert_true</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">|</span> <span class="n">scol_casted</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">())</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">scol_casted</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">arg</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">errors</span> <span class="o">==</span> <span class="s2">&quot;ignore&quot;</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;&#39;ignore&#39; is not implemented yet, when the `arg` is Series.&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;invalid error value specified&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">to_numeric</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="n">errors</span><span class="p">)</span></div>
<div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.broadcast.html#pyspark.pandas.broadcast">[docs]</a><span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Marks a DataFrame as small enough for use in broadcast joins.</span>
<span class="sd"> .. deprecated:: 3.2.0</span>
<span class="sd"> Use :func:`DataFrame.spark.hint` instead.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> obj : DataFrame</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> ret : DataFrame with broadcast hint.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.merge : Merge DataFrame objects with a database-style join.</span>
<span class="sd"> DataFrame.join : Join columns of another DataFrame.</span>
<span class="sd"> DataFrame.update : Modify in place using non-NA values from another DataFrame.</span>
<span class="sd"> DataFrame.spark.hint : Specifies some hint on the current DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame({&#39;lkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [1, 2, 3, 5]},</span>
<span class="sd"> ... columns=[&#39;lkey&#39;, &#39;value&#39;]).set_index(&#39;lkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame({&#39;rkey&#39;: [&#39;foo&#39;, &#39;bar&#39;, &#39;baz&#39;, &#39;foo&#39;],</span>
<span class="sd"> ... &#39;value&#39;: [5, 6, 7, 8]},</span>
<span class="sd"> ... columns=[&#39;rkey&#39;, &#39;value&#39;]).set_index(&#39;rkey&#39;)</span>
<span class="sd"> &gt;&gt;&gt; merged = df1.merge(ps.broadcast(df2), left_index=True, right_index=True)</span>
<span class="sd"> &gt;&gt;&gt; merged.spark.explain() # doctest: +ELLIPSIS</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> ...BroadcastHashJoin...</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`broadcast` has been deprecated and might be removed in a future version. &quot;</span>
<span class="s2">&quot;Use `DataFrame.spark.hint` with &#39;broadcast&#39; for `name` parameter instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Invalid type : expected DataFrame got </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="p">))</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="read_orc"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.read_orc.html#pyspark.pandas.read_orc">[docs]</a><span class="k">def</span> <span class="nf">read_orc</span><span class="p">(</span>
<span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Load an ORC object from the file path, returning a DataFrame.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path : str</span>
<span class="sd"> The path string storing the ORC file to be read.</span>
<span class="sd"> columns : list, default None</span>
<span class="sd"> If not None, only these columns will be read from the file.</span>
<span class="sd"> index_col : str or list of str, optional, default: None</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> options : dict</span>
<span class="sd"> All other options passed directly into Spark&#39;s data source.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_orc(&#39;%s/read_spark_io/data.orc&#39; % path)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_orc(&#39;%s/read_spark_io/data.orc&#39; % path, columns=[&#39;id&#39;])</span>
<span class="sd"> id</span>
<span class="sd"> 0 0</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(1).to_orc(&#39;%s/read_spark_io/data.orc&#39; % path, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_orc(&#39;%s/read_spark_io/data.orc&#39; % path, columns=[&#39;id&#39;], index_col=&quot;index&quot;)</span>
<span class="sd"> ... # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> index</span>
<span class="sd"> 0 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">read_spark_io</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="nb">format</span><span class="o">=</span><span class="s2">&quot;orc&quot;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">psdf_columns</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span>
<span class="n">new_columns</span> <span class="o">=</span> <span class="nb">list</span><span class="p">()</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">):</span>
<span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">psdf_columns</span><span class="p">:</span>
<span class="n">new_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Unknown column name &#39;</span><span class="si">{}</span><span class="s2">&#39;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">column</span><span class="p">))</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">new_columns</span><span class="p">]</span>
<span class="k">return</span> <span class="n">psdf</span></div>
<span class="k">def</span> <span class="nf">_get_index_map</span><span class="p">(</span>
<span class="n">sdf</span><span class="p">:</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]],</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]]:</span>
<span class="n">index_spark_columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]]</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]]</span>
<span class="k">if</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">index_col</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="n">sdf_columns</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">)</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">:</span>
<span class="k">if</span> <span class="n">col</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">sdf_columns</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">col</span><span class="p">,)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">index_col</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="n">index_spark_columns</span><span class="p">,</span> <span class="n">index_names</span>
<span class="n">_get_dummies_default_accept_types</span> <span class="o">=</span> <span class="p">(</span><span class="n">DecimalType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">DateType</span><span class="p">)</span>
<span class="n">_get_dummies_acceptable_types</span> <span class="o">=</span> <span class="n">_get_dummies_default_accept_types</span> <span class="o">+</span> <span class="p">(</span>
<span class="n">ByteType</span><span class="p">,</span>
<span class="n">ShortType</span><span class="p">,</span>
<span class="n">IntegerType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">FloatType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">TimestampType</span><span class="p">,</span>
<span class="n">TimestampNTZType</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">shutil</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">import</span> <span class="nn">uuid</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.namespace</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.namespace tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">db_name</span> <span class="o">=</span> <span class="s2">&quot;db</span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="nb">str</span><span class="p">(</span><span class="n">uuid</span><span class="o">.</span><span class="n">uuid4</span><span class="p">())</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s2">&quot;-&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;CREATE DATABASE </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;db&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">db_name</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">namespace</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="s2">&quot;DROP DATABASE IF EXISTS </span><span class="si">%s</span><span class="s2"> CASCADE&quot;</span> <span class="o">%</span> <span class="n">db_name</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
<div class='prev-next-bottom'>
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
  <!-- Site footer: copyright notice and credit for the documentation generator. -->
  <div class="container">
    <p>
      <!-- NOTE(review): the copyright holder below is empty; presumably the Sphinx
           "copyright" setting was unset at build time. Confirm and fill in. -->
      &copy; Copyright .<br/>
      Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
    </p>
  </div>
</footer>
</body>
</html>