<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>apache_beam.dataframe.io module &mdash; Apache Beam 2.36.0 documentation</title>
<script type="text/javascript" src="_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="apache_beam.dataframe.pandas_top_level_functions module" href="apache_beam.dataframe.pandas_top_level_functions.html" />
<link rel="prev" title="apache_beam.dataframe.frames module" href="apache_beam.dataframe.frames.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home"> Apache Beam
</a>
<div class="version">
2.36.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="apache_beam.dataframe.html#submodules">Submodules</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.convert.html">apache_beam.dataframe.convert module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.doctests.html">apache_beam.dataframe.doctests module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.expressions.html">apache_beam.dataframe.expressions module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frame_base.html">apache_beam.dataframe.frame_base module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frames.html">apache_beam.dataframe.frames module</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">apache_beam.dataframe.io module</a><ul>
<li class="toctree-l4"><a class="reference internal" href="#sources">Sources</a></li>
<li class="toctree-l4"><a class="reference internal" href="#sinks">Sinks</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.pandas_top_level_functions.html">apache_beam.dataframe.pandas_top_level_functions module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.partitionings.html">apache_beam.dataframe.partitionings module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.schemas.html">apache_beam.dataframe.schemas module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.transforms.html">apache_beam.dataframe.transforms module</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Apache Beam</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> &raquo;</li>
<li><a href="apache_beam.dataframe.html">apache_beam.dataframe package</a> &raquo;</li>
<li>apache_beam.dataframe.io module</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/apache_beam.dataframe.io.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="module-apache_beam.dataframe.io">
<span id="apache-beam-dataframe-io-module"></span><h1>apache_beam.dataframe.io module<a class="headerlink" href="#module-apache_beam.dataframe.io" title="Permalink to this headline"></a></h1>
<p>Sources and sinks for the Beam DataFrame API.</p>
<div class="section" id="sources">
<h2>Sources<a class="headerlink" href="#sources" title="Permalink to this headline"></a></h2>
<p>This module provides analogs for pandas <code class="docutils literal notranslate"><span class="pre">read</span></code> methods, like
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.read_csv.html#pandas.read_csv" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pandas.read_csv()</span></code></a>. However, Beam sources like <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a>
create a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>, and return a
<a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a> or
<a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredSeries</span></code></a> representing the contents
of the referenced file(s) or data source.</p>
<p>The result of these methods must be applied to a <code class="xref py py-class docutils literal notranslate"><span class="pre">Pipeline</span></code>
object, for example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="n">beam</span><span class="o">.</span><span class="n">dataframe</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="o">...</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="sinks">
<h2>Sinks<a class="headerlink" href="#sinks" title="Permalink to this headline"></a></h2>
<p>This module also defines analogs for pandas sink, or <code class="docutils literal notranslate"><span class="pre">to</span></code>, methods that
generate a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>. Users should prefer calling
these operations from <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a>
instances (for example with
<a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame.to_csv" title="apache_beam.dataframe.frames.DeferredDataFrame.to_csv"><code class="xref py py-meth docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></a>).</p>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_csv">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_csv</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>splittable=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_csv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_csv" title="Permalink to this definition"></a></dt>
<dd><p>Read a comma-separated values (csv) file into DataFrame.</p>
<p>Also supports optionally iterating over the file or breaking it
into chunks.</p>
<p>Additional help can be found in the online docs for
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">IO Tools</a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
expected. A local file could be: <a class="reference external" href="file://localhost/path/to/table.csv">file://localhost/path/to/table.csv</a>.</p>
<p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, such as
a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – Delimiter to use. If sep is None, the C engine cannot automatically detect
the separator, but the Python parsing engine can, meaning the latter will
be used and automatically detect the separator by Python’s builtin sniffer
tool, <code class="docutils literal notranslate"><span class="pre">csv.Sniffer</span></code>. In addition, separators longer than 1 character and
different from <code class="docutils literal notranslate"><span class="pre">'\s+'</span></code> will be interpreted as regular expressions and
will also force the use of the Python parsing engine. Note that regex
delimiters are prone to ignoring quoted data. Regex example: <code class="docutils literal notranslate"><span class="pre">'\r\t'</span></code>.</li>
<li><strong>delimiter</strong> (str, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – Alias for sep.</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><em>default 'infer'</em>) – Row number(s) to use as the column names, and the start of the
data. Default behavior is to infer the column names: if no names
are passed the behavior is identical to <code class="docutils literal notranslate"><span class="pre">header=0</span></code> and column
names are inferred from the first line of the file, if column
names are passed explicitly then the behavior is identical to
<code class="docutils literal notranslate"><span class="pre">header=None</span></code>. Explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to be able to
replace existing names. The header can be a list of integers that
specify row locations for a multi-index on the columns
e.g. [0,1,3]. Intervening rows that are not specified will be
skipped (e.g. 2 in this example is skipped). Note that this
parameter ignores commented lines and empty lines if
<code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>, so <code class="docutils literal notranslate"><span class="pre">header=0</span></code> denotes the first line of
data rather than the first line of the file.</li>
<li><strong>names</strong> (<em>array-like</em><em>, </em><em>optional</em>) – List of column names to use. If the file contains a header row,
then you should explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to override the column names.
Duplicates in this list are not allowed.</li>
<li><strong>index_col</strong> (int, str, sequence of int / str, or False, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>Column(s) to use as the row labels of the <code class="docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code>, either given as
string name or column index. If a sequence of int / str is given, a
MultiIndex is used.</p>
<p>Note: <code class="docutils literal notranslate"><span class="pre">index_col=False</span></code> can be used to force pandas to <em>not</em> use the first
column as the index, e.g. when you have a malformed file with delimiters at
the end of each line.</p>
</li>
<li><strong>usecols</strong> (<em>list-like</em><em> or </em><em>callable</em><em>, </em><em>optional</em>) – <p>Return a subset of the columns. If list-like, all elements must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in <cite>names</cite> or
inferred from the document header row(s). For example, a valid list-like
<cite>usecols</cite> parameter would be <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">2]</span></code> or <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar',</span> <span class="pre">'baz']</span></code>.
Element order is ignored, so <code class="docutils literal notranslate"><span class="pre">usecols=[0,</span> <span class="pre">1]</span></code> is the same as <code class="docutils literal notranslate"><span class="pre">[1,</span> <span class="pre">0]</span></code>.
To instantiate a DeferredDataFrame from <code class="docutils literal notranslate"><span class="pre">data</span></code> with element order preserved, use
<code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['foo',</span> <span class="pre">'bar']]</span></code> for columns
in <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar']</span></code> order or
<code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['bar',</span> <span class="pre">'foo']]</span></code>
for <code class="docutils literal notranslate"><span class="pre">['bar',</span> <span class="pre">'foo']</span></code> order.</p>
<p>If callable, the callable function will be evaluated against the column
names, returning names where the callable function evaluates to True. An
example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x.upper()</span> <span class="pre">in</span>
<span class="pre">['AAA',</span> <span class="pre">'BBB',</span> <span class="pre">'DDD']</span></code>. Using this parameter results in much faster
parsing time and lower memory usage.</p>
</li>
<li><strong>squeeze</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If the parsed data only contains one column then return a DeferredSeries.</li>
<li><strong>prefix</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1, …</li>
<li><strong>mangle_dupe_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Duplicate columns will be specified as ‘X’, ‘X.1’, … ‘X.N’, rather than
‘X’ … ‘X’. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.</li>
<li><strong>dtype</strong> (<em>Type name</em><em> or </em><em>dict of column -&gt; type</em><em>, </em><em>optional</em>) – Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32,
‘c’: ‘Int64’}.
Use <cite>str</cite> or <cite>object</cite> together with suitable <cite>na_values</cite> settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.</li>
<li><strong>engine</strong> (<em>{'c'</em><em>, </em><em>'python'}</em><em>, </em><em>optional</em>) – Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.</li>
<li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.</li>
<li><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as True.</li>
<li><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as False.</li>
<li><strong>skipinitialspace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Skip spaces after delimiter.</li>
<li><strong>skiprows</strong> (<em>list-like</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em> or </em><em>callable</em><em>, </em><em>optional</em>) – <p>Line numbers to skip (0-indexed) or number of lines to skip (int)
at the start of the file.</p>
<p>If callable, the callable function will be evaluated against the row
indices, returning True if the row should be skipped and False otherwise.
An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</p>
</li>
<li><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 0</em>) – Number of lines at bottom of file to skip (Unsupported with engine=’c’).</li>
<li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – Number of rows of file to read. Useful for reading pieces of large files.</li>
<li><strong>na_values</strong> (<em>scalar</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted as
NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’,
‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’,
‘nan’, ‘null’.</li>
<li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default NaN values when parsing the data.
Depending on whether <cite>na_values</cite> is passed in, the behavior is as follows:</p>
<ul>
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are specified, <cite>na_values</cite>
is appended to the default NaN values used for parsing.</li>
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are not specified, only
the default NaN values are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are specified, only
the NaN values specified in <cite>na_values</cite> are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are not specified, no
strings will be parsed as NaN.</li>
</ul>
<p>Note that if <cite>na_filter</cite> is passed in as False, the <cite>keep_default_na</cite> and
<cite>na_values</cite> parameters will be ignored.</p>
</li>
<li><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing na_filter=False can improve the performance
of reading a large file.</li>
<li><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Indicate number of NA values placed in non-numeric columns.</li>
<li><strong>skip_blank_lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – If True, skip over blank lines rather than interpreting as NaN values.</li>
<li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em> or </em><em>list of int</em><em> or </em><em>names</em><em> or </em><em>list of lists</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p>
<ul>
<li>boolean. If True -&gt; try parsing the index.</li>
<li>list of int or names. e.g. If [1, 2, 3] -&gt; try parsing columns 1, 2, 3
each as a separate date column.</li>
<li>list of lists. e.g. If [[1, 3]] -&gt; combine columns 1 and 3 and parse as
a single date column.</li>
<li>dict, e.g. {‘foo’ : [1, 3]} -&gt; parse columns 1, 3 as date and call
result ‘foo’</li>
</ul>
<p>If a column or index cannot be represented as an array of datetimes,
say because of an unparsable value or a mixture of timezones, the column
or index will be returned unaltered as an object data type. For
non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after
<code class="docutils literal notranslate"><span class="pre">pd.read_csv</span></code>. To parse an index or column with a mixture of timezones,
specify <code class="docutils literal notranslate"><span class="pre">date_parser</span></code> to be a partially-applied
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.to_datetime.html#pandas.to_datetime" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pandas.to_datetime()</span></code></a> with <code class="docutils literal notranslate"><span class="pre">utc=True</span></code>. See
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-csv-mixed-timezones" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><span>Parsing a CSV with mixed timezones</span></a> for more.</p>
<p>Note: A fast-path exists for iso8601-formatted dates.</p>
</li>
<li><strong>infer_datetime_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If True and <cite>parse_dates</cite> is enabled, pandas will attempt to infer the
format of the datetime strings in the columns, and if it can be inferred,
switch to a faster method of parsing them. In some cases this can increase
the parsing speed by 5-10x.</li>
<li><strong>keep_date_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If True and <cite>parse_dates</cite> specifies combining multiple columns then
keep the original columns.</li>
<li><strong>date_parser</strong> (<em>function</em><em>, </em><em>optional</em>) – Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the
conversion. Pandas will try to call <cite>date_parser</cite> in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays
(as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the
string values from the columns defined by <cite>parse_dates</cite> into a single array
and pass that; and 3) call <cite>date_parser</cite> once for each row using one or
more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as
arguments.</li>
<li><strong>dayfirst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – DD/MM format dates, international and European format.</li>
<li><strong>cache_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.25.0.</span></p>
</div>
</li>
<li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>Return TextFileReader object for iteration or getting chunks with
<code class="docutils literal notranslate"><span class="pre">get_chunk()</span></code>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p>
</div>
</li>
<li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – <p>Return TextFileReader object for iteration.
See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking">IO Tools docs</a>
for more information on <code class="docutils literal notranslate"><span class="pre">iterator</span></code> and <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p>
</div>
</li>
<li><strong>compression</strong> (<em>{'infer'</em><em>, </em><em>'gzip'</em><em>, </em><em>'bz2'</em><em>, </em><em>'zip'</em><em>, </em><em>'xz'</em><em>, </em><em>None}</em><em>, </em><em>default 'infer'</em>) – For on-the-fly decompression of on-disk data. If ‘infer’ and
<cite>filepath_or_buffer</cite> is path-like, then detect compression from the
following extensions: ‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’ (otherwise no
decompression). If using ‘zip’, the ZIP file must contain only one data
file to be read in. Set to None for no decompression.</li>
<li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Thousands separator.</li>
<li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g. use ‘,’ for European data).</li>
<li><strong>lineterminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character to break file into lines. Only valid with C parser.</li>
<li><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – The character used to denote the start and end of a quoted item. Quoted
items can include the delimiter and it will be ignored.</li>
<li><strong>quoting</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em> or </em><em>csv.QUOTE_* instance</em><em>, </em><em>default 0</em>) – Control field quoting behavior per <code class="docutils literal notranslate"><span class="pre">csv.QUOTE_*</span></code> constants. Use one of
QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).</li>
<li><strong>doublequote</strong> (bool, default <code class="docutils literal notranslate"><span class="pre">True</span></code>) – When quotechar is specified and quoting is not <code class="docutils literal notranslate"><span class="pre">QUOTE_NONE</span></code>, indicate
whether or not to interpret two consecutive quotechar elements INSIDE a
field as a single <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> element.</li>
<li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – One-character string used to escape other characters.</li>
<li><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Indicates remainder of line should not be parsed. If found at the beginning
of a line, the line will be ignored altogether. This parameter must be a
single character. Like empty lines (as long as <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>),
fully commented lines are ignored by the parameter <cite>header</cite> but not by
<cite>skiprows</cite>. For example, if <code class="docutils literal notranslate"><span class="pre">comment='#'</span></code>, parsing
<code class="docutils literal notranslate"><span class="pre">#empty\na,b,c\n1,2,3</span></code> with <code class="docutils literal notranslate"><span class="pre">header=0</span></code> will result in ‘a,b,c’ being
treated as the header.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Encoding to use for UTF when reading/writing (ex. ‘utf-8’). <a class="reference external" href="https://docs.python.org/3/library/codecs.html#standard-encodings">List of Python
standard encodings</a>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span>When <code class="docutils literal notranslate"><span class="pre">encoding</span></code> is <code class="docutils literal notranslate"><span class="pre">None</span></code>, <code class="docutils literal notranslate"><span class="pre">errors=&quot;replace&quot;</span></code> is passed to
<code class="docutils literal notranslate"><span class="pre">open()</span></code>. Otherwise, <code class="docutils literal notranslate"><span class="pre">errors=&quot;strict&quot;</span></code> is passed to <code class="docutils literal notranslate"><span class="pre">open()</span></code>.
This behavior was previously only the case for <code class="docutils literal notranslate"><span class="pre">engine=&quot;python&quot;</span></code>.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.3.0: </span><code class="docutils literal notranslate"><span class="pre">encoding_errors</span></code> is a new argument. <code class="docutils literal notranslate"><span class="pre">encoding</span></code> no longer has any
influence on how encoding errors are handled.</p>
</div>
</li>
<li><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default &quot;strict&quot;</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</li>
<li><strong>dialect</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/csv.html#csv.Dialect" title="(in Python v3.10)"><em>csv.Dialect</em></a><em>, </em><em>optional</em>) – If provided, this parameter will override values (default or not) for the
following parameters: <cite>delimiter</cite>, <cite>doublequote</cite>, <cite>escapechar</cite>,
<cite>skipinitialspace</cite>, <cite>quotechar</cite>, and <cite>quoting</cite>. If it is necessary to
override values, a ParserWarning will be issued. See csv.Dialect
documentation for more details.</li>
<li><strong>error_bad_lines</strong> (bool, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no DeferredDataFrame will be returned.
If False, then these “bad lines” will be dropped from the DeferredDataFrame that is
returned.</p>
<div class="deprecated">
<p><span class="versionmodified">Deprecated since version 1.3.0: </span>The <code class="docutils literal notranslate"><span class="pre">on_bad_lines</span></code> parameter should be used instead to specify behavior upon
encountering a bad line.</p>
</div>
</li>
<li><strong>warn_bad_lines</strong> (bool, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>If error_bad_lines is False, and warn_bad_lines is True, a warning for each
“bad line” will be output.</p>
<div class="deprecated">
<p><span class="versionmodified">Deprecated since version 1.3.0: </span>The <code class="docutils literal notranslate"><span class="pre">on_bad_lines</span></code> parameter should be used instead to specify behavior upon
encountering a bad line.</p>
</div>
</li>
<li><strong>on_bad_lines</strong> (<em>{'error'</em><em>, </em><em>'warn'</em><em>, </em><em>'skip'}</em><em>, </em><em>default 'error'</em>) – <p>Specifies what to do upon encountering a bad line (a line with too many fields).
Allowed values are:</p>
<blockquote>
<div><ul>
<li>‘error’, raise an Exception when a bad line is encountered.</li>
<li>‘warn’, raise a warning when a bad line is encountered and skip that line.</li>
<li>‘skip’, skip bad lines without raising or warning when they are encountered.</li>
</ul>
</div></blockquote>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</li>
<li><strong>delim_whitespace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Specifies whether or not whitespace (e.g. <code class="docutils literal notranslate"><span class="pre">'</span> <span class="pre">'</span></code> or <code class="docutils literal notranslate"><span class="pre">'</span>&#160;&#160;&#160; <span class="pre">'</span></code>) will be
used as the sep. Equivalent to setting <code class="docutils literal notranslate"><span class="pre">sep='\s+'</span></code>. If this option
is set to True, nothing should be passed in for the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code>
parameter.</li>
<li><strong>low_memory</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
types either set False, or specify the type with the <cite>dtype</cite> parameter.
Note that the entire file is read into a single DeferredDataFrame regardless,
use the <cite>chunksize</cite> or <cite>iterator</cite> parameter to return the data in chunks.
(Only valid with C parser).</li>
<li><strong>memory_map</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If a filepath is provided for <cite>filepath_or_buffer</cite>, map the file object
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.</li>
<li><strong>float_precision</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Specifies which converter the C engine should use for floating-point
values. The options are <code class="docutils literal notranslate"><span class="pre">None</span></code> or ‘high’ for the ordinary converter,
‘legacy’ for the original lower precision pandas converter, and
‘round_trip’ for the round-trip converter.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.</span></p>
</div>
</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A comma-separated values (csv) file is returned as a two-dimensional
data structure with labeled axes.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextParser</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>If your files are large and records do not contain quoted newlines, you may
pass the extra argument <code class="docutils literal notranslate"><span class="pre">splittable=True</span></code> to enable dynamic splitting for
this read on newlines. Using this option for records that do contain quoted
newlines may result in partial records and data corruption.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt>
<dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_fwf()</span></code></a></dt>
<dd>Read a table of fixed-width formatted lines into DeferredDataFrame.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API. In addition, some arguments shown here may not be supported, see <strong>‘Differences from pandas’</strong> for details.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data.csv&#39;</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_csv">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_csv</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>transform_label=None</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_csv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_csv" title="Permalink to this definition"></a></dt>
<dd><p>Write object to a comma-separated values (csv) file.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>file handle</em><em>, </em><em>default None</em>) – <p>File path or object, if None is provided the result is returned as
a string. If a non-binary file object is passed, it should be opened
with <cite>newline=''</cite>, disabling universal newlines. If a binary
file object is passed, <cite>mode</cite> might need to contain a <cite>‘b’</cite>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>Support for binary file objects was introduced.</p>
</div>
</li>
<li><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – String of length 1. Field delimiter for the output file.</li>
<li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</li>
<li><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – Format string for floating point numbers.</li>
<li><strong>columns</strong> (<em>sequence</em><em>, </em><em>optional</em>) – Columns to write.</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.</li>
<li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</li>
<li><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>sequence</em><em>, or </em><em>False</em><em>, </em><em>default None</em>) – Column label for index column(s) if desired. If None is given, and
<cite>header</cite> and <cite>index</cite> are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.</li>
<li><strong>mode</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – Python write mode, default ‘w’.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – A string representing the encoding to use in the output file,
defaults to ‘utf-8’. <cite>encoding</cite> is not supported if <cite>path_or_buf</cite>
is a non-binary file object.</li>
<li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>If str, represents compression mode. If dict, value at ‘method’ is
the compression mode. Compression mode may be any of the following
possible values: {‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}. If
compression mode is ‘infer’ and <cite>path_or_buf</cite> is path-like, then
detect compression mode from the following extensions: ‘.gz’,
‘.bz2’, ‘.zip’ or ‘.xz’ (otherwise no compression). If dict given
and mode is one of {‘zip’, ‘gzip’, ‘bz2’}, or inferred as
one of the above, other entries passed as
additional compression options.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.0.0: </span>May now be a dict with key ‘method’ as compression mode
and other entries as additional compression options if
compression mode is ‘zip’.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.1.0: </span>Passing compression options as keys in dict is
supported for compression modes ‘gzip’ and ‘bz2’
as well as ‘zip’.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>Compression is supported for binary file objects.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>Previous versions forwarded dict entries for ‘gzip’ to
<cite>gzip.open</cite> instead of <cite>gzip.GzipFile</cite> which prevented
setting <cite>mtime</cite>.</p>
</div>
</li>
<li><strong>quoting</strong> (<em>optional constant from csv module</em>) – Defaults to csv.QUOTE_MINIMAL. If you have set a <cite>float_format</cite>
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.</li>
<li><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '&quot;'</em>) – String of length 1. Character used to quote fields.</li>
<li><strong>line_terminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – The newline character or character sequence to use in the output
file. Defaults to <cite>os.linesep</cite>, which depends on the OS in which
this method is called (e.g. ‘\n’ for Linux, ‘\r\n’ for Windows).</li>
<li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)"><em>None</em></a>) – Rows to write at a time.</li>
<li><strong>date_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – Format string for datetime objects.</li>
<li><strong>doublequote</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Control quoting of <cite>quotechar</cite> inside a field.</li>
<li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – String of length 1. Character used to escape <cite>sep</cite> and <cite>quotechar</cite>
when appropriate.</li>
<li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator. E.g. use ‘,’ for
European data.</li>
<li><strong>errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default 'strict'</em>) – <p>Specifies how encoding and decoding errors are to be handled.
See the errors argument for <a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.10)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> for a full list
of options.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)">None</a> or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)">str</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Load a CSV file into a DeferredDataFrame.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.to_excel" title="apache_beam.dataframe.io.to_excel"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_excel()</span></code></a></dt>
<dd>Write DeferredDataFrame to an Excel file.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;name&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;Raphael&#39;</span><span class="p">,</span> <span class="s1">&#39;Donatello&#39;</span><span class="p">],</span>
<span class="gp">... </span> <span class="s1">&#39;mask&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;red&#39;</span><span class="p">,</span> <span class="s1">&#39;purple&#39;</span><span class="p">],</span>
<span class="gp">... </span> <span class="s1">&#39;weapon&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;sai&#39;</span><span class="p">,</span> <span class="s1">&#39;bo staff&#39;</span><span class="p">]})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="go">&#39;name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n&#39;</span>
<span class="go">Create &#39;out.zip&#39; containing &#39;out.csv&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">compression_opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s1">&#39;zip&#39;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">archive_name</span><span class="o">=</span><span class="s1">&#39;out.csv&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">&#39;out.zip&#39;</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="n">compression_opts</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_fwf">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_fwf</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_fwf"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_fwf" title="Permalink to this definition"></a></dt>
<dd><p>Read a table of fixed-width formatted lines into DeferredDataFrame.</p>
<p>Also supports optionally iterating or breaking of the file
into chunks.</p>
<p>Additional help can be found in the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">online docs for IO Tools</a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.csv</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any
<code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>colspecs</strong> (<em>list of tuple</em><em> (</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>) or </em><em>'infer', optional</em>) – A list of tuples giving the extents of the fixed-width
fields of each line as half-open intervals (i.e., [from, to[ ).
String value ‘infer’ can be used to instruct the parser to try
detecting the column specifications from the first 100 rows of
the data which are not being skipped via skiprows (default=‘infer’).</li>
<li><strong>widths</strong> (<em>list of int</em><em>, </em><em>optional</em>) – A list of field widths which can be used instead of ‘colspecs’ if
the intervals are contiguous.</li>
<li><strong>infer_nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 100</em>) – The number of rows to consider when letting the parser determine the
<cite>colspecs</cite>.</li>
<li><strong>**kwds</strong> (<em>optional</em>) – Optional keyword arguments can be passed to <code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code>.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A comma-separated values (csv) file is returned as a two-dimensional
data structure with labeled axes.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextParser</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt>
<dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_fwf</span><span class="p">(</span><span class="s1">&#39;data.csv&#39;</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_json">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_json</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_json"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_json" title="Permalink to this definition"></a></dt>
<dd><p>Convert a JSON string to pandas object.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path_or_buf</strong> (<em>a valid JSON str</em><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.json</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any
<code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – <p>Indication of expected JSON string format.
Compatible JSON strings can be produced by <code class="docutils literal notranslate"><span class="pre">to_json()</span></code> with a
corresponding orient value.
The set of possible orients is:</p>
<ul>
<li><code class="docutils literal notranslate"><span class="pre">'split'</span></code> : dict like
<code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">[index],</span> <span class="pre">columns</span> <span class="pre">-&gt;</span> <span class="pre">[columns],</span> <span class="pre">data</span> <span class="pre">-&gt;</span> <span class="pre">[values]}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'records'</span></code> : list like
<code class="docutils literal notranslate"><span class="pre">[{column</span> <span class="pre">-&gt;</span> <span class="pre">value},</span> <span class="pre">...</span> <span class="pre">,</span> <span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">value}]</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'index'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">value}}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'columns'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">value}}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'values'</span></code> : just the values array</li>
</ul>
<p>The allowed and default values depend on the value
of the <cite>typ</cite> parameter.</p>
<ul>
<li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'series'</span></code>,<ul>
<li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index'}</span></code></li>
<li>default is <code class="docutils literal notranslate"><span class="pre">'index'</span></code></li>
<li>The DeferredSeries index must be unique for orient <code class="docutils literal notranslate"><span class="pre">'index'</span></code>.</li>
</ul>
</li>
<li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'frame'</span></code>,<ul>
<li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index',</span>
<span class="pre">'columns','values',</span> <span class="pre">'table'}</span></code></li>
<li>default is <code class="docutils literal notranslate"><span class="pre">'columns'</span></code></li>
<li>The DeferredDataFrame index must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code> and
<code class="docutils literal notranslate"><span class="pre">'columns'</span></code>.</li>
<li>The DeferredDataFrame columns must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code>,
<code class="docutils literal notranslate"><span class="pre">'columns'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'records'</span></code>.</li>
</ul>
</li>
</ul>
</li>
<li><strong>typ</strong> (<em>{'frame'</em><em>, </em><em>'series'}</em><em>, </em><em>default 'frame'</em>) – The type of object to recover.</li>
<li><strong>dtype</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default None</em>) – <p>If True, infer dtypes; if a dict of column to dtype, then use those;
if False, then don’t infer dtypes at all, applies only to the data.</p>
<p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 0.25.0: </span>Not applicable for <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>.</p>
</div>
</li>
<li><strong>convert_axes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default None</em>) – <p>Try to convert the axes to the proper dtypes.</p>
<p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 0.25.0: </span>Not applicable for <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>.</p>
</div>
</li>
<li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – If True then default datelike columns may be converted (depending on
keep_default_dates).
If False, no dates will be converted.
If a list of column names, then those columns will be converted and
default datelike columns may also be converted (depending on
keep_default_dates).</li>
<li><strong>keep_default_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>If parsing dates (convert_dates is not False), then try to parse the
default datelike columns.
A column label is datelike if</p>
<ul>
<li>it ends with <code class="docutils literal notranslate"><span class="pre">'_at'</span></code>,</li>
<li>it ends with <code class="docutils literal notranslate"><span class="pre">'_time'</span></code>,</li>
<li>it begins with <code class="docutils literal notranslate"><span class="pre">'timestamp'</span></code>,</li>
<li>it is <code class="docutils literal notranslate"><span class="pre">'modified'</span></code>, or</li>
<li>it is <code class="docutils literal notranslate"><span class="pre">'date'</span></code>.</li>
</ul>
</li>
<li><strong>numpy</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>Direct decoding to numpy arrays. Supports numeric data only, but
non-numeric column and index labels are supported. Note also that the
JSON ordering MUST be the same for each term if numpy=True.</p>
<div class="deprecated">
<p><span class="versionmodified">Deprecated since version 1.0.0.</span></p>
</div>
</li>
<li><strong>precise_float</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Set to enable usage of higher precision (strtod) function when
decoding string to double values. Default (False) is to use fast but
less precise builtin functionality.</li>
<li><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of ‘s’, ‘ms’, ‘us’ or ‘ns’ to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default is 'utf-8'</em>) – The encoding to use to decode py3 bytes.</li>
<li><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default &quot;strict&quot;</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a> .</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</li>
<li><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Read the file as a json object per line.</li>
<li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – <p>Return JsonReader object for iteration.
See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json">line-delimited json docs</a>
for more information on <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.
This can only be passed if <cite>lines=True</cite>.
If this is None, the file will be read into memory all at once.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">JsonReader</span></code> is a context manager.</p>
</div>
</li>
<li><strong>compression</strong> (<em>{'infer'</em><em>, </em><em>'gzip'</em><em>, </em><em>'bz2'</em><em>, </em><em>'zip'</em><em>, </em><em>'xz'</em><em>, </em><em>None}</em><em>, </em><em>default 'infer'</em>) – For on-the-fly decompression of on-disk data. If ‘infer’, then use
gzip, bz2, zip or xz if path_or_buf is a string ending in
‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’, respectively, and no decompression
otherwise. If using ‘zip’, the ZIP file must contain only one data
file to be read in. Set to None for no decompression.</li>
<li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – <p>The number of lines from the line-delimited JSON file that has to be read.
This can only be passed if <cite>lines=True</cite>.
If this is None, all the rows will be returned.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.</span></p>
</div>
</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The type returned depends on the value of <cite>typ</cite>.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries">DeferredSeries</a> or <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json()</span></code></dt>
<dd>Convert a DeferredDataFrame to a JSON string.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredSeries.to_json()</span></code></dt>
<dd>Convert a DeferredSeries to a JSON string.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>Specific to <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, if a <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> with a literal
<code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name of <cite>index</cite> gets written with <a class="reference internal" href="#apache_beam.dataframe.io.to_json" title="apache_beam.dataframe.io.to_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_json()</span></code></a>, the
subsequent read operation will incorrectly set the <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name to
<code class="docutils literal notranslate"><span class="pre">None</span></code>. This is because <cite>index</cite> is also used by <code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json()</span></code>
to denote a missing <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name, and the subsequent
<a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_json()</span></code></a> operation cannot distinguish between the two. The same
limitation is encountered with a <code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code> and any names
beginning with <code class="docutils literal notranslate"><span class="pre">'level_'</span></code>.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;b&#39;</span><span class="p">],</span> <span class="p">[</span><span class="s1">&#39;c&#39;</span><span class="p">,</span> <span class="s1">&#39;d&#39;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;row 1&#39;</span><span class="p">,</span> <span class="s1">&#39;row 2&#39;</span><span class="p">],</span>
<span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;col 1&#39;</span><span class="p">,</span> <span class="s1">&#39;col 2&#39;</span><span class="p">])</span>
<span class="go">Encoding/decoding a Dataframe using ``&#39;split&#39;`` formatted JSON:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">&#39;split&#39;</span><span class="p">)</span>
<span class="go"> &#39;{&quot;columns&quot;:[&quot;col 1&quot;,&quot;col 2&quot;],&quot;index&quot;:[&quot;row 1&quot;,&quot;row 2&quot;],&quot;data&quot;:[[&quot;a&quot;,&quot;b&quot;],[&quot;c&quot;,&quot;d&quot;]]}&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">&#39;split&#39;</span><span class="p">)</span>
<span class="go"> col 1 col 2</span>
<span class="go">row 1 a b</span>
<span class="go">row 2 c d</span>
<span class="go">Encoding/decoding a Dataframe using ``&#39;index&#39;`` formatted JSON:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">&#39;index&#39;</span><span class="p">)</span>
<span class="go">&#39;{&quot;row 1&quot;:{&quot;col 1&quot;:&quot;a&quot;,&quot;col 2&quot;:&quot;b&quot;},&quot;row 2&quot;:{&quot;col 1&quot;:&quot;c&quot;,&quot;col 2&quot;:&quot;d&quot;}}&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">&#39;index&#39;</span><span class="p">)</span>
<span class="go"> col 1 col 2</span>
<span class="go">row 1 a b</span>
<span class="go">row 2 c d</span>
<span class="go">Encoding/decoding a Dataframe using ``&#39;records&#39;`` formatted JSON.</span>
<span class="go">Note that index labels are not preserved with this encoding.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">&#39;records&#39;</span><span class="p">)</span>
<span class="go">&#39;[{&quot;col 1&quot;:&quot;a&quot;,&quot;col 2&quot;:&quot;b&quot;},{&quot;col 1&quot;:&quot;c&quot;,&quot;col 2&quot;:&quot;d&quot;}]&#39;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">&#39;records&#39;</span><span class="p">)</span>
<span class="go"> col 1 col 2</span>
<span class="go">0 a b</span>
<span class="go">1 c d</span>
<span class="go">Encoding with Table Schema</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">&#39;table&#39;</span><span class="p">)</span>
<span class="go"> &#39;{&quot;schema&quot;:{&quot;fields&quot;:[{&quot;name&quot;:&quot;index&quot;,&quot;type&quot;:&quot;string&quot;},{&quot;name&quot;:&quot;col 1&quot;,&quot;type&quot;:&quot;string&quot;},{&quot;name&quot;:&quot;col 2&quot;,&quot;type&quot;:&quot;string&quot;}],&quot;primaryKey&quot;:[&quot;index&quot;],&quot;pandas_version&quot;:&quot;0.20.0&quot;},&quot;data&quot;:[{&quot;index&quot;:&quot;row 1&quot;,&quot;col 1&quot;:&quot;a&quot;,&quot;col 2&quot;:&quot;b&quot;},{&quot;index&quot;:&quot;row 2&quot;,&quot;col 1&quot;:&quot;c&quot;,&quot;col 2&quot;:&quot;d&quot;}]}&#39;</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_json">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_json</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>orient=None</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_json"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_json" title="Permalink to this definition"></a></dt>
<dd><p>Convert the object to a JSON string.</p>
<p>Note that NaNs and None will be converted to null and datetime objects
will be converted to UNIX timestamps.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>file handle</em><em>, </em><em>optional</em>) – File path or object. If not specified, the result is returned as
a string.</li>
<li><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – <p>Indication of expected JSON string format.</p>
<ul>
<li>DeferredSeries:<blockquote>
<div><ul>
<li>default is ‘index’</li>
<li>allowed values are: {‘split’, ‘records’, ‘index’, ‘table’}.</li>
</ul>
</div></blockquote>
</li>
<li>DeferredDataFrame:<blockquote>
<div><ul>
<li>default is ‘columns’</li>
<li>allowed values are: {‘split’, ‘records’, ‘index’, ‘columns’,
‘values’, ‘table’}.</li>
</ul>
</div></blockquote>
</li>
<li>The format of the JSON string:<blockquote>
<div><ul>
<li>‘split’ : dict like {‘index’ -&gt; [index], ‘columns’ -&gt; [columns],
‘data’ -&gt; [values]}</li>
<li>‘records’ : list like [{column -&gt; value}, … , {column -&gt; value}]</li>
<li>‘index’ : dict like {index -&gt; {column -&gt; value}}</li>
<li>‘columns’ : dict like {column -&gt; {index -&gt; value}}</li>
<li>‘values’ : just the values array</li>
<li>‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}}</li>
</ul>
<p>Describing the data, where the data component is like <code class="docutils literal notranslate"><span class="pre">orient='records'</span></code>.</p>
</div></blockquote>
</li>
</ul>
</li>
<li><strong>date_format</strong> (<em>{None</em><em>, </em><em>'epoch'</em><em>, </em><em>'iso'}</em>) – Type of date conversion. ‘epoch’ = epoch milliseconds,
‘iso’ = ISO8601. The default depends on the <cite>orient</cite>. For
<code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, the default is ‘iso’. For all other orients,
the default is ‘epoch’.</li>
<li><strong>double_precision</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 10</em>) – The number of decimal places to use when encoding
floating point values.</li>
<li><strong>force_ascii</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Force encoded string to be ASCII.</li>
<li><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default 'ms'</em><em> (</em><em>milliseconds</em><em>)</em>) – The time unit to encode to, governs timestamp and ISO8601
precision. One of ‘s’, ‘ms’, ‘us’, ‘ns’ for second, millisecond,
microsecond, and nanosecond respectively.</li>
<li><strong>default_handler</strong> (<em>callable</em><em>, </em><em>default None</em>) – Handler to call if object cannot otherwise be converted to a
suitable format for JSON. Should receive a single argument which is
the object to convert and return a serialisable object.</li>
<li><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If ‘orient’ is ‘records’ write out line-delimited json format. Will
throw ValueError if incorrect ‘orient’ since others are not
list-like.</li>
<li><strong>compression</strong> (<em>{'infer'</em><em>, </em><em>'gzip'</em><em>, </em><em>'bz2'</em><em>, </em><em>'zip'</em><em>, </em><em>'xz'</em><em>, </em><em>None}</em>) – A string representing the compression to use in the output file,
only used when the first argument is a filename. By default, the
compression is inferred from the filename.</li>
<li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether to include the index values in the JSON string. Not
including the index (<code class="docutils literal notranslate"><span class="pre">index=False</span></code>) is only supported when
orient is ‘split’ or ‘table’.</li>
<li><strong>indent</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – <p>Length of whitespace used to indent each record.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If path_or_buf is None, returns the resulting json format as a
string. Otherwise returns None.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)">None</a> or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)">str</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_json()</span></code></a></dt>
<dd>Convert a JSON string to pandas object.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>The behavior of <code class="docutils literal notranslate"><span class="pre">indent=0</span></code> varies from the stdlib, which does not
indent the output but does insert newlines. Currently, <code class="docutils literal notranslate"><span class="pre">indent=0</span></code>
and the default <code class="docutils literal notranslate"><span class="pre">indent=None</span></code> are equivalent in pandas, though this
may change in a future release.</p>
<p><code class="docutils literal notranslate"><span class="pre">orient='table'</span></code> contains a ‘pandas_version’ field under ‘schema’.
This stores the version of <cite>pandas</cite> used in the latest revision of the
schema.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">json</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span>
<span class="gp">... </span> <span class="p">[[</span><span class="s2">&quot;a&quot;</span><span class="p">,</span> <span class="s2">&quot;b&quot;</span><span class="p">],</span> <span class="p">[</span><span class="s2">&quot;c&quot;</span><span class="p">,</span> <span class="s2">&quot;d&quot;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;row 1&quot;</span><span class="p">,</span> <span class="s2">&quot;row 2&quot;</span><span class="p">],</span>
<span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;col 1&quot;</span><span class="p">,</span> <span class="s2">&quot;col 2&quot;</span><span class="p">],</span>
<span class="gp">... </span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;split&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">{</span>
<span class="go"> &quot;columns&quot;: [</span>
<span class="go"> &quot;col 1&quot;,</span>
<span class="go"> &quot;col 2&quot;</span>
<span class="go"> ],</span>
<span class="go"> &quot;index&quot;: [</span>
<span class="go"> &quot;row 1&quot;,</span>
<span class="go"> &quot;row 2&quot;</span>
<span class="go"> ],</span>
<span class="go"> &quot;data&quot;: [</span>
<span class="go"> [</span>
<span class="go"> &quot;a&quot;,</span>
<span class="go"> &quot;b&quot;</span>
<span class="go"> ],</span>
<span class="go"> [</span>
<span class="go"> &quot;c&quot;,</span>
<span class="go"> &quot;d&quot;</span>
<span class="go"> ]</span>
<span class="go"> ]</span>
<span class="go">}</span>
</pre></div>
</div>
<p>Encoding/decoding a DataFrame using <code class="docutils literal notranslate"><span class="pre">'records'</span></code> formatted JSON.
Note that index labels are not preserved with this encoding.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;records&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">[</span>
<span class="go"> {</span>
<span class="go"> &quot;col 1&quot;: &quot;a&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;b&quot;</span>
<span class="go"> },</span>
<span class="go"> {</span>
<span class="go"> &quot;col 1&quot;: &quot;c&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;d&quot;</span>
<span class="go"> }</span>
<span class="go">]</span>
</pre></div>
</div>
<p>Encoding/decoding a DataFrame using <code class="docutils literal notranslate"><span class="pre">'index'</span></code> formatted JSON:</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;index&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">{</span>
<span class="go"> &quot;row 1&quot;: {</span>
<span class="go"> &quot;col 1&quot;: &quot;a&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;b&quot;</span>
<span class="go"> },</span>
<span class="go"> &quot;row 2&quot;: {</span>
<span class="go"> &quot;col 1&quot;: &quot;c&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;d&quot;</span>
<span class="go"> }</span>
<span class="go">}</span>
</pre></div>
</div>
<p>Encoding/decoding a DataFrame using <code class="docutils literal notranslate"><span class="pre">'columns'</span></code> formatted JSON:</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;columns&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">{</span>
<span class="go"> &quot;col 1&quot;: {</span>
<span class="go"> &quot;row 1&quot;: &quot;a&quot;,</span>
<span class="go"> &quot;row 2&quot;: &quot;c&quot;</span>
<span class="go"> },</span>
<span class="go"> &quot;col 2&quot;: {</span>
<span class="go"> &quot;row 1&quot;: &quot;b&quot;,</span>
<span class="go"> &quot;row 2&quot;: &quot;d&quot;</span>
<span class="go"> }</span>
<span class="go">}</span>
</pre></div>
</div>
<p>Encoding/decoding a DataFrame using <code class="docutils literal notranslate"><span class="pre">'values'</span></code> formatted JSON:</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;values&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">[</span>
<span class="go"> [</span>
<span class="go"> &quot;a&quot;,</span>
<span class="go"> &quot;b&quot;</span>
<span class="go"> ],</span>
<span class="go"> [</span>
<span class="go"> &quot;c&quot;,</span>
<span class="go"> &quot;d&quot;</span>
<span class="go"> ]</span>
<span class="go">]</span>
</pre></div>
</div>
<p>Encoding with Table Schema:</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;table&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">json</span><span class="o">.</span><span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
<span class="go">{</span>
<span class="go"> &quot;schema&quot;: {</span>
<span class="go"> &quot;fields&quot;: [</span>
<span class="go"> {</span>
<span class="go"> &quot;name&quot;: &quot;index&quot;,</span>
<span class="go"> &quot;type&quot;: &quot;string&quot;</span>
<span class="go"> },</span>
<span class="go"> {</span>
<span class="go"> &quot;name&quot;: &quot;col 1&quot;,</span>
<span class="go"> &quot;type&quot;: &quot;string&quot;</span>
<span class="go"> },</span>
<span class="go"> {</span>
<span class="go"> &quot;name&quot;: &quot;col 2&quot;,</span>
<span class="go"> &quot;type&quot;: &quot;string&quot;</span>
<span class="go"> }</span>
<span class="go"> ],</span>
<span class="go"> &quot;primaryKey&quot;: [</span>
<span class="go"> &quot;index&quot;</span>
<span class="go"> ],</span>
<span class="go"> &quot;pandas_version&quot;: &quot;0.20.0&quot;</span>
<span class="go"> },</span>
<span class="go"> &quot;data&quot;: [</span>
<span class="go"> {</span>
<span class="go"> &quot;index&quot;: &quot;row 1&quot;,</span>
<span class="go"> &quot;col 1&quot;: &quot;a&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;b&quot;</span>
<span class="go"> },</span>
<span class="go"> {</span>
<span class="go"> &quot;index&quot;: &quot;row 2&quot;,</span>
<span class="go"> &quot;col 1&quot;: &quot;c&quot;,</span>
<span class="go"> &quot;col 2&quot;: &quot;d&quot;</span>
<span class="go"> }</span>
<span class="go"> ]</span>
<span class="go">}</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_html">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_html</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_html"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_html" title="Permalink to this definition"></a></dt>
<dd><p>Read HTML tables into a <code class="docutils literal notranslate"><span class="pre">list</span></code> of <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> objects.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – A URL, a file-like object, or a raw string containing HTML. Note that
lxml only accepts the http, ftp and file url protocols. If you have a
URL that starts with <code class="docutils literal notranslate"><span class="pre">'https'</span></code> you might try removing the <code class="docutils literal notranslate"><span class="pre">'s'</span></code>.</li>
<li><strong>match</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>compiled regular expression</em><em>, </em><em>optional</em>) – The set of tables containing text matching this regex or string will be
returned. Unless the HTML is extremely simple you will probably need to
pass a non-empty string here. Defaults to ‘.+’ (match any non-empty
string). The default value will return all tables contained on a page.
This value is converted to a regular expression so that there is
consistent behavior between Beautiful Soup and lxml.</li>
<li><strong>flavor</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – The parsing engine to use. ‘bs4’ and ‘html5lib’ are synonymous with
each other, they are both there for backwards compatibility. The
default of <code class="docutils literal notranslate"><span class="pre">None</span></code> tries to use <code class="docutils literal notranslate"><span class="pre">lxml</span></code> to parse and if that fails it
falls back on <code class="docutils literal notranslate"><span class="pre">bs4</span></code> + <code class="docutils literal notranslate"><span class="pre">html5lib</span></code>.</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The row (or list of rows for a <a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.MultiIndex.html#pandas.MultiIndex" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code></a>) to use to
make the columns headers.</li>
<li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The column (or list of columns) to use to create the index.</li>
<li><strong>skiprows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>list-like</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#slice" title="(in Python v3.10)"><em>slice</em></a><em>, </em><em>optional</em>) – Number of rows to skip after parsing the column integer. 0-based. If a
sequence of integers or a slice is given, will skip the rows indexed by
that sequence. Note that a single element sequence means ‘skip the nth
row’ whereas an integer means ‘skip n rows’.</li>
<li><strong>attrs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>This is a dictionary of attributes that you can pass to identify
the table in the HTML. These are not checked for validity before being
passed to lxml or Beautiful Soup. However, these attributes must be
valid HTML table attributes to work correctly. For example,</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;id&#39;</span><span class="p">:</span> <span class="s1">&#39;table&#39;</span><span class="p">}</span>
</pre></div>
</div>
<p>is a valid attribute dictionary because the ‘id’ HTML tag attribute is
a valid HTML attribute for <em>any</em> HTML tag as per <a class="reference external" href="https://html.spec.whatwg.org/multipage/dom.html#global-attributes">this document</a>.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;asdf&#39;</span><span class="p">:</span> <span class="s1">&#39;table&#39;</span><span class="p">}</span>
</pre></div>
</div>
<p>is <em>not</em> a valid attribute dictionary because ‘asdf’ is not a valid
HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
table attributes can be found <a class="reference external" href="http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2">here</a>. A
working draft of the HTML 5 spec can be found <a class="reference external" href="https://html.spec.whatwg.org/multipage/tables.html">here</a>. It contains the
latest information on table attributes for the modern web.</p>
</li>
<li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>optional</em>) – See <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> for more details.</li>
<li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Separator to use to parse thousands. Defaults to <code class="docutils literal notranslate"><span class="pre">','</span></code>.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – The encoding used to decode the web page. Defaults to <code class="docutils literal notranslate"><span class="pre">None</span></code>. <code class="docutils literal notranslate"><span class="pre">None</span></code>
preserves the previous encoding behavior, which depends on the
underlying parser library (e.g., the parser library will try to use
the encoding provided by the document).</li>
<li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g. use ‘,’ for European
data).</li>
<li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the cell (not column) content, and return the
transformed content.</li>
<li><strong>na_values</strong> (<em>iterable</em><em>, </em><em>default None</em>) – Custom NA values.</li>
<li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they’re appended to.</li>
<li><strong>displayed_only</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether elements with “display: none” should be parsed.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A list of DeferredDataFrames.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">dfs</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>Before using this function you should read the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-html-gotchas" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><span class="xref std std-ref">gotchas about the
HTML parsing libraries</span></a>.</p>
<p>Expect to do some cleanup after you call this function. For example, you
might need to manually assign column names if the column names are
converted to NaN when you pass the <cite>header=0</cite> argument. We try to assume as
little as possible about the structure of the table and push the
idiosyncrasies of the HTML contained in the table to the user.</p>
<p>This function searches for <code class="docutils literal notranslate"><span class="pre">&lt;table&gt;</span></code> elements and only for <code class="docutils literal notranslate"><span class="pre">&lt;tr&gt;</span></code>
and <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> rows and <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> elements within each <code class="docutils literal notranslate"><span class="pre">&lt;tr&gt;</span></code> or <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code>
element in the table. <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> stands for “table data”. This function
attempts to properly handle <code class="docutils literal notranslate"><span class="pre">colspan</span></code> and <code class="docutils literal notranslate"><span class="pre">rowspan</span></code> attributes.
If the table has a <code class="docutils literal notranslate"><span class="pre">&lt;thead&gt;</span></code> element, it is used to construct
the header, otherwise the function attempts to find the header within
the body (by putting rows with only <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> elements into the header).</p>
<p>Similar to <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> the <cite>header</cite> argument is applied
<strong>after</strong> <cite>skiprows</cite> is applied.</p>
<p>This function will <em>always</em> return a list of <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> <em>or</em>
it will fail, e.g., it will <em>not</em> return an empty list.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<p>See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-read-html">read_html documentation in the IO section of the pandas docs</a>
for some examples of reading in HTML tables.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_html">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_html</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_html"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_html" title="Permalink to this definition"></a></dt>
<dd><p>Render a DataFrame as an HTML table.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>Path</em><em> or </em><em>StringIO-like</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Buffer to write to. If None, the output is returned as a string.</li>
<li><strong>columns</strong> (<em>sequence</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – The subset of columns to write. Writes all columns by default.</li>
<li><strong>col_space</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em> or </em><em>dict of int</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – <p>The minimum width of each column in CSS length units. An int is assumed to be px units.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.25.0: </span>Ability to use str.</p>
</div>
</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether to print column labels, default True.</li>
<li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Whether to print index (row) labels.</li>
<li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default 'NaN'</em>) – String representation of <code class="docutils literal notranslate"><span class="pre">NaN</span></code> to use.</li>
<li><strong>formatters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.10)"><em>tuple</em></a><em> or </em><em>dict of one-param. functions</em><em>, </em><em>optional</em>) – Formatter functions to apply to columns’ elements by position or
name.
The result of each function must be a unicode string.
List/tuple must be of length equal to the number of columns.</li>
<li><strong>float_format</strong> (<em>one-parameter function</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – <p>Formatter function to apply to columns’ elements if they are
floats. This function must return a unicode string and will be
applied only to the non-<code class="docutils literal notranslate"><span class="pre">NaN</span></code> elements, with <code class="docutils literal notranslate"><span class="pre">NaN</span></code> being
handled by <code class="docutils literal notranslate"><span class="pre">na_rep</span></code>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0.</span></p>
</div>
</li>
<li><strong>sparsify</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Set to False for a DeferredDataFrame with a hierarchical index to print
every multiindex key at each row.</li>
<li><strong>index_names</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Prints the names of the indexes.</li>
<li><strong>justify</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – <p>How to justify the column labels. If None uses the option from
the print configuration (controlled by set_option), ‘right’ out
of the box. Valid values are</p>
<ul>
<li>left</li>
<li>right</li>
<li>center</li>
<li>justify</li>
<li>justify-all</li>
<li>start</li>
<li>end</li>
<li>inherit</li>
<li>match-parent</li>
<li>initial</li>
<li>unset</li>
</ul>
</li>
<li><strong>max_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of rows to display in the console.</li>
<li><strong>min_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – The number of rows to display in the console in a truncated repr
(when number of rows is above <cite>max_rows</cite>).</li>
<li><strong>max_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of columns to display in the console.</li>
<li><strong>show_dimensions</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Display DeferredDataFrame dimensions (number of rows by number of columns).</li>
<li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator, e.g. ‘,’ in Europe.</li>
<li><strong>bold_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Make the row labels bold in the output.</li>
<li><strong>classes</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.10)"><em>tuple</em></a><em>, </em><em>default None</em>) – CSS class(es) to apply to the resulting html table.</li>
<li><strong>escape</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert the characters &lt;, &gt;, and &amp; to HTML-safe sequences.</li>
<li><strong>notebook</strong> (<em>{True</em><em>, </em><em>False}</em><em>, </em><em>default False</em>) – Whether the generated HTML is for IPython Notebook.</li>
<li><strong>border</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a>) – A <code class="docutils literal notranslate"><span class="pre">border=border</span></code> attribute is included in the opening
<cite>&lt;table&gt;</cite> tag. Default <code class="docutils literal notranslate"><span class="pre">pd.options.display.html.border</span></code>.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default &quot;utf-8&quot;</em>) – <p>Set character encoding.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.0.</span></p>
</div>
</li>
<li><strong>table_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – A css id is included in the opening <cite>&lt;table&gt;</cite> tag if specified.</li>
<li><strong>render_links</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Convert URLs to HTML links.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If buf is None, returns the result as a string. Otherwise returns
None.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)">str</a> or <a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)">None</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">to_string()</span></code></dt>
<dd>Convert DeferredDataFrame to a string.</dd>
</dl>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_excel">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_excel</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_excel" title="Permalink to this definition"></a></dt>
<dd><p>Read an Excel file into a pandas DataFrame.</p>
<p>Supports <cite>xls</cite>, <cite>xlsx</cite>, <cite>xlsm</cite>, <cite>xlsb</cite>, <cite>odf</cite>, <cite>ods</cite> and <cite>odt</cite> file extensions
read from a local filesystem or URL. Supports an option to read
a single sheet or a list of sheets.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.10)"><em>bytes</em></a><em>, </em><em>ExcelFile</em><em>, </em><em>xlrd.Book</em><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.xlsx</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)"><em>None</em></a><em>, </em><em>default 0</em>) – <p>Strings are used for sheet names. Integers are used in zero-indexed
sheet positions. Lists of strings/integers are used to request
multiple sheets. Specify None to get all sheets.</p>
<p>Available cases:</p>
<ul>
<li>Defaults to <code class="docutils literal notranslate"><span class="pre">0</span></code>: 1st sheet as a <cite>DeferredDataFrame</cite></li>
<li><code class="docutils literal notranslate"><span class="pre">1</span></code>: 2nd sheet as a <cite>DeferredDataFrame</cite></li>
<li><code class="docutils literal notranslate"><span class="pre">&quot;Sheet1&quot;</span></code>: Load sheet with name “Sheet1”</li>
<li><code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">&quot;Sheet5&quot;]</span></code>: Load first, second and sheet named “Sheet5”
as a dict of <cite>DeferredDataFrame</cite></li>
<li>None: All sheets.</li>
</ul>
</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><em>default 0</em>) – Row (0-indexed) to use for the column labels of the parsed
DeferredDataFrame. If a list of integers is passed those row positions will
be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. Use None if there is no header.</li>
<li><strong>names</strong> (<em>array-like</em><em>, </em><em>default None</em>) – List of column names to use. If file contains no header row,
then you should explicitly pass header=None.</li>
<li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><em>default None</em>) – Column (0-indexed) to use as the row labels of the DeferredDataFrame.
Pass None if there is no such column. If a list is passed,
those columns will be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. If a
subset of data is selected with <code class="docutils literal notranslate"><span class="pre">usecols</span></code>, index_col
is based on the subset.</li>
<li><strong>usecols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><em>callable</em><em>, </em><em>default None</em>) – <ul>
<li>If None, then parse all columns.</li>
<li>If str, then indicates comma separated list of Excel column letters
and column ranges (e.g. “A:E” or “A,C,E:F”). Ranges are inclusive of
both sides.</li>
<li>If list of int, then indicates list of column numbers to be parsed.</li>
<li>If list of string, then indicates list of column names to be parsed.</li>
<li>If callable, then evaluate each column name against it and parse the
column if the callable returns <code class="docutils literal notranslate"><span class="pre">True</span></code>.</li>
</ul>
<p>Returns a subset of the columns according to behavior above.</p>
</li>
<li><strong>squeeze</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – If the parsed data only contains one column then return a DeferredSeries.</li>
<li><strong>dtype</strong> (<em>Type name</em><em> or </em><em>dict of column -&gt; type</em><em>, </em><em>default None</em>) – Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32}
Use <cite>object</cite> to preserve data as stored in Excel and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.</li>
<li><strong>engine</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – <p>If io is not a buffer or path, this must be set to identify io.
Supported engines: “xlrd”, “openpyxl”, “odf”, “pyxlsb”.
Engine compatibility:</p>
<ul>
<li>“xlrd” supports old-style Excel files (.xls).</li>
<li>“openpyxl” supports newer Excel file formats.</li>
<li>“odf” supports OpenDocument file formats (.odf, .ods, .odt).</li>
<li>“pyxlsb” supports Binary Excel files.</li>
</ul>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>The engine <a class="reference external" href="https://xlrd.readthedocs.io/en/latest/">xlrd</a>
now only supports old-style <code class="docutils literal notranslate"><span class="pre">.xls</span></code> files.
When <code class="docutils literal notranslate"><span class="pre">engine=None</span></code>, the following logic will be
used to determine the engine:</p>
<ul>
<li>If <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an OpenDocument format (.odf, .ods, .odt),
then <a class="reference external" href="https://pypi.org/project/odfpy/">odf</a> will be used.</li>
<li>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an xls format,
<code class="docutils literal notranslate"><span class="pre">xlrd</span></code> will be used.</li>
<li>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is in xlsb format,
<code class="docutils literal notranslate"><span class="pre">pyxlsb</span></code> will be used.<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</li>
<li>Otherwise <code class="docutils literal notranslate"><span class="pre">openpyxl</span></code> will be used.<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.3.0.</span></p>
</div>
</li>
</ul>
</div>
</li>
<li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
input argument, the Excel cell content, and return the transformed
content.</li>
<li><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as True.</li>
<li><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as False.</li>
<li><strong>skiprows</strong> (<em>list-like</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, or </em><em>callable</em><em>, </em><em>optional</em>) – Line numbers to skip (0-indexed) or number of lines to skip (int) at the
start of the file. If callable, the callable function will be evaluated
against the row indices, returning True if the row should be skipped and
False otherwise. An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span>
<span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</li>
<li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default None</em>) – Number of rows to parse.</li>
<li><strong>na_values</strong> (<em>scalar</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default None</em>) – Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted
as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’,
‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’,
‘nan’, ‘null’.</li>
<li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default NaN values when parsing the data.
Depending on whether <cite>na_values</cite> is passed in, the behavior is as follows:</p>
<ul>
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are specified, <cite>na_values</cite>
is appended to the default NaN values used for parsing.</li>
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are not specified, only
the default NaN values are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are specified, only
the NaN values specified in <cite>na_values</cite> are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are not specified, no
strings will be parsed as NaN.</li>
</ul>
<p>Note that if <cite>na_filter</cite> is passed in as False, the <cite>keep_default_na</cite> and
<cite>na_values</cite> parameters will be ignored.</p>
</li>
<li><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing na_filter=False can improve the performance
of reading a large file.</li>
<li><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Indicate number of NA values placed in non-numeric columns.</li>
<li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p>
<ul>
<li>bool. If True -&gt; try parsing the index.</li>
<li>list of int or names. e.g. If [1, 2, 3] -&gt; try parsing columns 1, 2, 3
each as a separate date column.</li>
<li>list of lists. e.g. If [[1, 3]] -&gt; combine columns 1 and 3 and parse as
a single date column.</li>
<li>dict, e.g. {‘foo’ : [1, 3]} -&gt; parse columns 1, 3 as date and call
result ‘foo’</li>
</ul>
<p>If a column or index contains an unparsable date, the entire column or
index will be returned unaltered as an object data type. If you don’t want to
parse some cells as date just change their type in Excel to “Text”.
For non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after <code class="docutils literal notranslate"><span class="pre">pd.read_excel</span></code>.</p>
<p>Note: A fast-path exists for iso8601-formatted dates.</p>
</li>
<li><strong>date_parser</strong> (<em>function</em><em>, </em><em>optional</em>) – Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the
conversion. Pandas will try to call <cite>date_parser</cite> in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays
(as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the
string values from the columns defined by <cite>parse_dates</cite> into a single array
and pass that; and 3) call <cite>date_parser</cite> once for each row using one or
more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as
arguments.</li>
<li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – Thousands separator for parsing string columns to numeric. Note that
this parameter is only necessary for columns stored as TEXT in Excel,
any numeric columns will automatically be parsed, regardless of display
format.</li>
<li><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default None</em>) – Comments out remainder of line. Pass a character or characters to this
argument to indicate comments in the input file. Any data between the
comment string and the end of the current line is ignored.</li>
<li><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 0</em>) – Rows at the end to skip (0-indexed).</li>
<li><strong>convert_float</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Convert integral floats to int (i.e., 1.0 -&gt; 1). If False, all numeric
data will be read in as floats: Excel stores all numbers as floats
internally.</p>
<div class="deprecated">
<p><span class="versionmodified">Deprecated since version 1.3.0: </span>convert_float will be removed in a future version</p>
</div>
</li>
<li><strong>mangle_dupe_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Duplicate columns will be specified as ‘X’, ‘X.1’, … ‘X.N’, rather than
‘X’ … ‘X’. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc., if using a URL that will
be parsed by <code class="docutils literal notranslate"><span class="pre">fsspec</span></code>, e.g., starting “s3://”, “gcs://”. An error
will be raised if providing this argument with a local path or
a file-like buffer. See the fsspec and backend storage implementation
docs for the set of allowed keys and values.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">DeferredDataFrame from the passed in Excel file. See notes in sheet_name
argument for more information on when a dict of DeferredDataFrames is returned.</p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or dict of DeferredDataFrames</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_excel()</span></code></dt>
<dd>Write DeferredDataFrame to an Excel file.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt>
<dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_fwf()</span></code></a></dt>
<dd>Read a table of fixed-width formatted lines into DeferredDataFrame.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">The file can be read using the file name as string or an open file object:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="go"> Name Value</span>
<span class="go">0 string1 1</span>
<span class="go">1 string2 2</span>
<span class="go">2 #Comment 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="s1">&#39;rb&#39;</span><span class="p">),</span>
<span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">&#39;Sheet3&#39;</span><span class="p">)</span>
<span class="go"> Unnamed: 0 Name Value</span>
<span class="go">0 0 string1 1</span>
<span class="go">1 1 string2 2</span>
<span class="go">2 2 #Comment 3</span>
<span class="go">Index and header can be specified via the `index_col` and `header` arguments</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
<span class="go"> 0 1 2</span>
<span class="go">0 NaN Name Value</span>
<span class="go">1 0.0 string1 1</span>
<span class="go">2 1.0 string2 2</span>
<span class="go">3 2.0 #Comment 3</span>
<span class="go">Column types are inferred but can be explicitly specified</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">dtype</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;Name&#39;</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="s1">&#39;Value&#39;</span><span class="p">:</span> <span class="nb">float</span><span class="p">})</span>
<span class="go"> Name Value</span>
<span class="go">0 string1 1.0</span>
<span class="go">1 string2 2.0</span>
<span class="go">2 #Comment 3.0</span>
<span class="go">True, False, and NA values, and thousands separators have defaults,</span>
<span class="go">but can be explicitly specified, too. Supply the values you would like</span>
<span class="go">as strings or lists of strings!</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">na_values</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;string1&#39;</span><span class="p">,</span> <span class="s1">&#39;string2&#39;</span><span class="p">])</span>
<span class="go"> Name Value</span>
<span class="go">0 NaN 1</span>
<span class="go">1 NaN 2</span>
<span class="go">2 #Comment 3</span>
<span class="go">Comment lines in the excel input file can be skipped using the `comment` kwarg</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">&#39;tmp.xlsx&#39;</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">&#39;#&#39;</span><span class="p">)</span>
<span class="go"> Name Value</span>
<span class="go">0 string1 1.0</span>
<span class="go">1 string2 2.0</span>
<span class="go">2 None NaN</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_feather">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_feather</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_feather" title="Permalink to this definition"></a></dt>
<dd><p>Load a feather-format object from the file path.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.feather</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any
<code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>columns</strong> (<em>sequence</em><em>, </em><em>default None</em>) – If not provided, all columns are read.</li>
<li><strong>use_threads</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether to parallelize reading using multiple threads.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">type of object stored in file</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_parquet">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_parquet</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_parquet" title="Permalink to this definition"></a></dt>
<dd><p>Load a parquet object from the file path, returning a DataFrame.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
expected. A local file could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.parquet</span></code>.
A file URL can also be a path to a directory that contains multiple
partitioned parquet files. Both pyarrow and fastparquet support
paths to directories as well as file URLs. A directory path could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/tables</span></code> or <code class="docutils literal notranslate"><span class="pre">s3://bucket/partition_dir</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any
<code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – Parquet library to use. If ‘auto’, then the option
<code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code>
behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if
‘pyarrow’ is unavailable.</li>
<li><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>default=None</em>) – If not None, only these columns will be read from the file.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</li>
<li><strong>use_nullable_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>If True, use dtypes that use <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> as missing value indicator
for the resulting DeferredDataFrame. (only applicable for the <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code>
engine)
As new dtypes are added that support <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> in the future, the
output with this option will change to use those dtypes.
Note: this is an experimental option, and behaviour (e.g. additional
support dtypes) may change without notice.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
<li><strong>**kwargs</strong> – Any additional kwargs are passed to the engine.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_sas">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_sas</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_sas" title="Permalink to this definition"></a></dt>
<dd><p>Read SAS files stored as either XPORT or SAS7BDAT format files.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
<code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.sas</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any
<code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>format</strong> (<em>str {'xport'</em><em>, </em><em>'sas7bdat'}</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)"><em>None</em></a>) – If None, file format is inferred from file extension. If ‘xport’ or
‘sas7bdat’, uses the corresponding format.</li>
<li><strong>index</strong> (<em>identifier of index column</em><em>, </em><em>defaults to None</em>) – Identifier of column that should be used as index of the DeferredDataFrame.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default is None</em>) – Encoding for text data. If None, text data are stored as raw bytes.</li>
<li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a>) – <p>Read file <cite>chunksize</cite> lines at a time, returns iterator.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p>
</div>
</li>
<li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>defaults to False</em>) – <p>If True, returns an iterator for reading the file incrementally.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><em>DeferredDataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader</em></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_spss">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_spss</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_spss" title="Permalink to this definition"></a></dt>
<dd><p>Load an SPSS file from the file path, returning a DeferredDataFrame.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 0.25.0.</span></p>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>Path</em>) – File path.</li>
<li><strong>usecols</strong> (<em>list-like</em><em>, </em><em>optional</em>) – Return a subset of the columns. If None, return all columns.</li>
<li><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default is True</em>) – Convert categorical columns into pd.Categorical.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.read_stata">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_stata</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_stata" title="Permalink to this definition"></a></dt>
<dd><p>Read Stata file into a DeferredDataFrame.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid
URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.dta</span></code>.</p>
<p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p>
<p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method,
such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function)
or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p>
</li>
<li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert date variables to DeferredDataFrame time values.</li>
<li><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Read value labels and convert columns to Categorical/Factor variables.</li>
<li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Column to set as index.</li>
<li><strong>convert_missing</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Flag indicating whether to convert missing values to their Stata
representations. If False, missing values are replaced with nan.
If True, columns containing missing values are returned with
object data types and missing values are represented by
StataMissingValue objects.</li>
<li><strong>preserve_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Preserve Stata datatypes. If False, numeric data are upcast to pandas
default types for foreign data (float64 or int64).</li>
<li><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.10)"><em>None</em></a>) – Columns to retain. Columns will be returned in the given order. None
returns all columns.</li>
<li><strong>order_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Flag indicating whether converted categorical data are ordered.</li>
<li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default None</em>) – Return StataReader object for iterations, returns chunks with
given number of lines.</li>
<li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default False</em>) – Return StataReader object.</li>
<li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default None</em>) – If string, specifies compression mode. If dict, value at key ‘method’
specifies compression mode. Compression mode must be one of {‘infer’,
‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}. If compression mode is ‘infer’
and <cite>filepath_or_buffer</cite> is path-like, then detect compression from
the following extensions: ‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’ (otherwise
no compression). If dict and compression mode is one of
{‘zip’, ‘gzip’, ‘bz2’}, or inferred as one of the above,
other entries passed as additional compression options.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p>
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or StataReader</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataReader()</span></code></dt>
<dd>Low-level reader for Stata data files.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_stata()</span></code></dt>
<dd>Export Stata data files.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>Categorical variables read through an iterator may not have the same
categories and dtype. This occurs when a variable stored in a DTA
file is associated to an incomplete set of value labels that only
label a strict subset of the values.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Read a Stata dta file:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">&#39;filename.dta&#39;</span><span class="p">)</span>
<span class="go">Read a Stata dta file in 10,000 line chunks:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">itr</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">&#39;filename.dta&#39;</span><span class="p">,</span> <span class="n">chunksize</span><span class="o">=</span><span class="mi">10000</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">chunk</span> <span class="ow">in</span> <span class="n">itr</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">do_something</span><span class="p">(</span><span class="n">chunk</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_excel">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_excel</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_excel" title="Permalink to this definition"></a></dt>
<dd><p>Write object to an Excel sheet.</p>
<p>To write a single object to an Excel .xlsx file it is only necessary to
specify a target file name. To write to multiple sheets it is necessary to
create an <cite>ExcelWriter</cite> object with a target file name, and specify a sheet
in the file to write to.</p>
<p>Multiple sheets may be written to by specifying unique <cite>sheet_name</cite>.
With all data written to the file it is necessary to save the changes.
Note that creating an <cite>ExcelWriter</cite> object with a file name that already
exists will result in the contents of the existing file being erased.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>excel_writer</strong> (<em>path-like</em><em>, </em><em>file-like</em><em>, or </em><em>ExcelWriter object</em>) – File path or existing ExcelWriter.</li>
<li><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default 'Sheet1'</em>) – Name of sheet which will contain DeferredDataFrame.</li>
<li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</li>
<li><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Format string for floating point numbers. For example
<code class="docutils literal notranslate"><span class="pre">float_format=&quot;%.2f&quot;</span></code> will format 0.1234 to 0.12.</li>
<li><strong>columns</strong> (<em>sequence</em><em> or </em><em>list of str</em><em>, </em><em>optional</em>) – Columns to write.</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – Write out the column names. If a list of string is given it is
assumed to be aliases for the column names.</li>
<li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</li>
<li><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>sequence</em><em>, </em><em>optional</em>) – Column label for index column(s) if desired. If not specified, and
<cite>header</cite> and <cite>index</cite> are True, then the index names are used. A
sequence should be given if the DeferredDataFrame uses MultiIndex.</li>
<li><strong>startrow</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell row to dump data frame.</li>
<li><strong>startcol</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell column to dump data frame.</li>
<li><strong>engine</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Write engine to use, ‘openpyxl’ or ‘xlsxwriter’. You can also set this
via the options <code class="docutils literal notranslate"><span class="pre">io.excel.xlsx.writer</span></code>, <code class="docutils literal notranslate"><span class="pre">io.excel.xls.writer</span></code>, and
<code class="docutils literal notranslate"><span class="pre">io.excel.xlsm.writer</span></code>.</p>
<div class="deprecated">
<p><span class="versionmodified">Deprecated since version 1.2.0: </span>As the <a class="reference external" href="https://pypi.org/project/xlwt/">xlwt</a> package is no longer
maintained, the <code class="docutils literal notranslate"><span class="pre">xlwt</span></code> engine will be removed in a future version
of pandas.</p>
</div>
</li>
<li><strong>merge_cells</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Write MultiIndex and Hierarchical Rows as merged cells.</li>
<li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – Encoding of the resulting excel file. Only necessary for xlwt,
other writers support unicode natively.</li>
<li><strong>inf_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>default 'inf'</em>) – Representation for infinity (there is no native representation for
infinity in Excel).</li>
<li><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default True</em>) – Display more information in the error logs.</li>
<li><strong>freeze_panes</strong> (<em>tuple of int</em><em> (</em><em>length 2</em><em>)</em><em>, </em><em>optional</em>) – Specifies the one-based bottommost row and rightmost column that
is to be frozen.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.to_csv" title="apache_beam.dataframe.io.to_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_csv()</span></code></a></dt>
<dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">ExcelWriter()</span></code></dt>
<dd>Class for writing DeferredDataFrame objects into excel sheets.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_excel" title="apache_beam.dataframe.io.read_excel"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_excel()</span></code></a></dt>
<dd>Read an Excel file into a DeferredDataFrame.</dd>
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt>
<dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>For compatibility with <code class="xref py py-meth docutils literal notranslate"><span class="pre">to_csv()</span></code>,
to_excel serializes lists and dicts to strings before writing.</p>
<p>Once a workbook has been saved it is not possible to write further
data without rewriting the whole workbook.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Create, write to and save a workbook:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">&#39;a&#39;</span><span class="p">,</span> <span class="s1">&#39;b&#39;</span><span class="p">],</span> <span class="p">[</span><span class="s1">&#39;c&#39;</span><span class="p">,</span> <span class="s1">&#39;d&#39;</span><span class="p">]],</span>
<span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;row 1&#39;</span><span class="p">,</span> <span class="s1">&#39;row 2&#39;</span><span class="p">],</span>
<span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;col 1&#39;</span><span class="p">,</span> <span class="s1">&#39;col 2&#39;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">&quot;output.xlsx&quot;</span><span class="p">)</span>
<span class="go">To specify the sheet name:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">&quot;output.xlsx&quot;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">&#39;Sheet_name_1&#39;</span><span class="p">)</span>
<span class="go">If you wish to write to more than one sheet in the workbook, it is</span>
<span class="go">necessary to specify an ExcelWriter object:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">&#39;output.xlsx&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">&#39;Sheet_name_1&#39;</span><span class="p">)</span>
<span class="gp">... </span> <span class="n">df2</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">&#39;Sheet_name_2&#39;</span><span class="p">)</span>
<span class="go">ExcelWriter can also be used to append to an existing Excel file:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">&#39;output.xlsx&#39;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;a&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span>
<span class="gp">... </span> <span class="n">df</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">&#39;Sheet_name_3&#39;</span><span class="p">)</span>
<span class="go">To set the library that is used to write the Excel file,</span>
<span class="go">you can pass the `engine` keyword (the default engine is</span>
<span class="go">automatically chosen depending on the file extension):</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s1">&#39;output1.xlsx&#39;</span><span class="p">,</span> <span class="n">engine</span><span class="o">=</span><span class="s1">&#39;xlsxwriter&#39;</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_feather">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_feather</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_feather" title="Permalink to this definition"></a></dt>
<dd><p>Write a DataFrame to the binary Feather format.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>file-like object</em>) – If a string, it will be used as Root Directory path.</li>
<li><strong>**kwargs</strong><p>Additional keywords passed to <code class="xref py py-func docutils literal notranslate"><span class="pre">pyarrow.feather.write_feather()</span></code>.
Starting with pyarrow 0.17, this includes the <cite>compression</cite>,
<cite>compression_level</cite>, <cite>chunksize</cite> and <cite>version</cite> keywords.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_parquet">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_parquet</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_parquet" title="Permalink to this definition"></a></dt>
<dd><p>Write a DataFrame to the binary parquet format.</p>
<p>This function writes the dataframe as a <a class="reference external" href="https://parquet.apache.org/">parquet file</a>. You can choose different parquet
backends, and have the option of compression. See
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><span class="xref std std-ref">the user guide</span></a> for more details.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><em>file-like object</em><em>, </em><em>default None</em>) – <p>If a string, it will be used as Root Directory path
when writing a partitioned dataset. By file-like object,
we refer to objects with a write() method, such as a file handle
(e.g. via builtin open function) or io.BytesIO. The engine
fastparquet does not accept file-like objects. If path is None,
a bytes object is returned.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0.</span></p>
</div>
<p>Previously this was “fname”</p>
</li>
<li><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – Parquet library to use. If ‘auto’, then the option
<code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code>
behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if
‘pyarrow’ is unavailable.</li>
<li><strong>compression</strong> (<em>{'snappy'</em><em>, </em><em>'gzip'</em><em>, </em><em>'brotli'</em><em>, </em><em>None}</em><em>, </em><em>default 'snappy'</em>) – Name of the compression to use. Use <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression.</li>
<li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a><em>, </em><em>default None</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code>, include the dataframe’s index(es) in the file output.
If <code class="docutils literal notranslate"><span class="pre">False</span></code>, they will not be written to the file.
If <code class="docutils literal notranslate"><span class="pre">None</span></code>, similar to <code class="docutils literal notranslate"><span class="pre">True</span></code> the dataframe’s index(es)
will be saved. However, instead of being saved as values,
the RangeIndex will be stored as a range in the metadata so it
doesn’t require much space and is faster. Other indexes will
be included as columns in the file output.</li>
<li><strong>partition_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Column names by which to partition the dataset.
Columns are partitioned in the order they are given.
Must be None if path is not a string.</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
<li><strong>**kwargs</strong> – Additional arguments passed to the parquet library. See
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v1.5.0.dev0+279.g7651c08230)"><span class="xref std std-ref">pandas io</span></a> for more details.</li>
</ul>
</td>
</tr>
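<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">The parquet content as a bytes object if <code class="docutils literal notranslate"><span class="pre">path</span></code> is None; otherwise nothing is returned.</p>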
</td>
</tr>
<tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">bytes if no path argument is provided else None</p>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_parquet" title="apache_beam.dataframe.io.read_parquet"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_parquet()</span></code></a></dt>
<dd>Read a parquet file.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt>
<dd>Write a csv file.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_sql()</span></code></dt>
<dd>Write to a sql table.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_hdf()</span></code></dt>
<dd>Write to hdf.</dd>
</dl>
</div>
<p class="rubric">Notes</p>
<p>This function requires either the <a class="reference external" href="https://pypi.org/project/fastparquet">fastparquet</a> or <a class="reference external" href="https://arrow.apache.org/docs/python/">pyarrow</a> library.</p>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;col1&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s1">&#39;col2&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="s1">&#39;df.parquet.gzip&#39;</span><span class="p">,</span>
<span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="s1">&#39;gzip&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s1">&#39;df.parquet.gzip&#39;</span><span class="p">)</span>
<span class="go"> col1 col2</span>
<span class="go">0 1 3</span>
<span class="go">1 2 4</span>
</pre></div>
</div>
<p>If you want to get a buffer to the parquet content you can use an io.BytesIO
object, as long as you don’t use partition_cols, which creates multiple files.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">io</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">f</span> <span class="o">=</span> <span class="n">io</span><span class="o">.</span><span class="n">BytesIO</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="go">0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">content</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.dataframe.io.to_stata">
<code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_stata</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_stata" title="Permalink to this definition"></a></dt>
<dd><p>Export DataFrame object to Stata dta format.</p>
<p>Writes the DataFrame to a Stata dataset file.
“dta” files contain a Stata dataset.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>buffer</em><em> or </em><em>path object</em>) – <p>String, path object (pathlib.Path or py._path.local.LocalPath) or
object implementing a binary write() function. If using a buffer
then the buffer will not be automatically closed after the file
data has been written.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.0.0.</span></p>
</div>
<p>Previously this was “fname”</p>
</li>
<li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a>) – Dictionary mapping columns containing datetime types to stata
internal format to use when writing the dates. Options are ‘tc’,
‘td’, ‘tm’, ‘tw’, ‘th’, ‘tq’, ‘ty’. Column can be either an integer
or a name. Datetime columns that do not have a conversion type
specified will be converted to ‘tc’. Raises NotImplementedError if
a datetime column has timezone information.</li>
<li><strong>write_index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a>) – Write the index to Stata dataset.</li>
<li><strong>byteorder</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – Can be “&gt;”, “&lt;”, “little”, or “big”. default is <cite>sys.byteorder</cite>.</li>
<li><strong>time_stamp</strong> (<em>datetime</em>) – A datetime to use as file creation date. Default is the current
time.</li>
<li><strong>data_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>, </em><em>optional</em>) – A label for the data set. Must be 80 characters or smaller.</li>
<li><strong>variable_labels</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a>) – Dictionary containing columns as keys and variable labels as
values. Each label must be 80 characters or smaller.</li>
<li><strong>version</strong> (<em>{114</em><em>, </em><em>117</em><em>, </em><em>118</em><em>, </em><em>119</em><em>, </em><em>None}</em><em>, </em><em>default 114</em>) – <p>Version to use in the output dta file. Set to None to let pandas
decide between 118 or 119 formats depending on the number of
columns in the frame. Version 114 can be read by Stata 10 and
later. Version 117 can be read by Stata 13 or later. Version 118
is supported in Stata 14 and later. Version 119 is supported in
Stata 15 and later. Version 114 limits string variables to 244
characters or fewer while versions 117 and later allow strings
with lengths up to 2,000,000 characters. Versions 118 and 119
support Unicode characters, and version 119 supports more than
32,767 variables.</p>
<p>Version 119 should usually only be used when the number of
variables exceeds the capacity of dta format 118. Exporting
smaller datasets in format 119 may have unintended consequences,
and, as of November 2020, Stata SE cannot read version 119 files.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.0.0: </span>Added support for formats 118 and 119.</p>
</div>
</li>
<li><strong>convert_strl</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.10)"><em>list</em></a><em>, </em><em>optional</em>) – List of column names to convert to string columns to Stata StrL
format. Only available if version is 117. Storing strings in the
StrL format can produce smaller dta files if strings have more than
8 characters and values are repeated.</li>
<li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output dta. If string, specifies
compression mode. If dict, value at key ‘method’ specifies
compression mode. Compression mode must be one of {‘infer’, ‘gzip’,
‘bz2’, ‘zip’, ‘xz’, None}. If compression mode is ‘infer’ and
<cite>path</cite> is path-like, then detect compression from the following
extensions: ‘.gz’, ‘.bz2’, ‘.zip’, or ‘.xz’ (otherwise no
compression). If dict and compression mode is one of {‘zip’,
‘gzip’, ‘bz2’}, or inferred as one of the above, other entries
passed as additional compression options.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</li>
<li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.10)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib</span></code> as header options. For other URLs (e.g.
starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">fsspec</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more details.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Raises:</th><td class="field-body"><ul class="first last">
<li><p class="first"><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#NotImplementedError" title="(in Python v3.10)"><code class="xref py py-exc docutils literal notranslate"><span class="pre">NotImplementedError</span></code></a> –</p>
<ul class="last simple">
<li>If datetimes contain timezone information</li>
<li>Column dtype is not representable in Stata</li>
</ul>
</li>
<li><p class="first"><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#ValueError" title="(in Python v3.10)"><code class="xref py py-exc docutils literal notranslate"><span class="pre">ValueError</span></code></a> –</p>
<ul class="last simple">
<li>Columns listed in convert_dates are neither datetime64[ns] nor datetime.datetime</li>
<li>Column listed in convert_dates is not in DeferredDataFrame</li>
<li>Categorical label contains more than 32,000 characters</li>
</ul>
</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p class="rubric">Differences from pandas</p>
<p>This operation has no known divergences from the pandas API.</p>
<div class="admonition seealso">
<p class="first admonition-title">See also</p>
<dl class="last docutils">
<dt><a class="reference internal" href="#apache_beam.dataframe.io.read_stata" title="apache_beam.dataframe.io.read_stata"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_stata()</span></code></a></dt>
<dd>Import Stata data files.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataWriter()</span></code></dt>
<dd>Low-level writer for Stata data files.</dd>
<dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataWriter117()</span></code></dt>
<dd>Low-level writer for version 117 files.</dd>
</dl>
</div>
<p class="rubric">Examples</p>
<p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p>
<div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;animal&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;falcon&#39;</span><span class="p">,</span> <span class="s1">&#39;parrot&#39;</span><span class="p">,</span> <span class="s1">&#39;falcon&#39;</span><span class="p">,</span>
<span class="gp">... </span> <span class="s1">&#39;parrot&#39;</span><span class="p">],</span>
<span class="gp">... </span> <span class="s1">&#39;speed&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mi">350</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">361</span><span class="p">,</span> <span class="mi">15</span><span class="p">]})</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">&#39;animals.dta&#39;</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="apache_beam.dataframe.pandas_top_level_functions.html" class="btn btn-neutral float-right" title="apache_beam.dataframe.pandas_top_level_functions module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="apache_beam.dataframe.frames.html" class="btn btn-neutral float-left" title="apache_beam.dataframe.frames module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>