| |
| |
| <!DOCTYPE html> |
| <html class="writer-html5" lang="en" data-content_root="./"> |
| <head> |
| <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /> |
| |
| |
| <title>apache_beam.dataframe.io module — Apache Beam 2.67.0 documentation</title> |
| <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=b86133f3" /> |
| <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=e59714d7" /> |
| |
| |
| <script src="_static/jquery.js?v=5d32c60e"></script> |
| <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script> |
| <script src="_static/documentation_options.js?v=959b4fbe"></script> |
| <script src="_static/doctools.js?v=9a2dae69"></script> |
| <script src="_static/sphinx_highlight.js?v=dc90522c"></script> |
| <script src="_static/js/theme.js"></script> |
| <link rel="index" title="Index" href="genindex.html" /> |
| <link rel="search" title="Search" href="search.html" /> |
| <link rel="next" title="apache_beam.dataframe.pandas_top_level_functions module" href="apache_beam.dataframe.pandas_top_level_functions.html" /> |
| <link rel="prev" title="apache_beam.dataframe.frames module" href="apache_beam.dataframe.frames.html" /> |
| </head> |
| |
| <body class="wy-body-for-nav"> |
| <div class="wy-grid-for-nav"> |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="index.html" class="icon icon-home"> |
| Apache Beam |
| </a> |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| </div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu"> |
| <ul class="current"> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li> |
| <li class="toctree-l1 current"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a><ul class="current"> |
| <li class="toctree-l2 current"><a class="reference internal" href="apache_beam.dataframe.html#submodules">Submodules</a><ul class="current"> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.convert.html">apache_beam.dataframe.convert module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.doctests.html">apache_beam.dataframe.doctests module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.expressions.html">apache_beam.dataframe.expressions module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frame_base.html">apache_beam.dataframe.frame_base module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frames.html">apache_beam.dataframe.frames module</a></li> |
| <li class="toctree-l3 current"><a class="current reference internal" href="#">apache_beam.dataframe.io module</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#sources">Sources</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#sinks">Sinks</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_gbq"><code class="docutils literal notranslate"><span class="pre">read_gbq()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_csv"><code class="docutils literal notranslate"><span class="pre">read_csv()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_csv"><code class="docutils literal notranslate"><span class="pre">to_csv()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf"><code class="docutils literal notranslate"><span class="pre">read_fwf()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_json"><code class="docutils literal notranslate"><span class="pre">read_json()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_json"><code class="docutils literal notranslate"><span class="pre">to_json()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_html"><code class="docutils literal notranslate"><span class="pre">read_html()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_html"><code class="docutils literal notranslate"><span class="pre">to_html()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.ReadViaPandas"><code class="docutils literal notranslate"><span class="pre">ReadViaPandas</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.WriteViaPandas"><code class="docutils literal notranslate"><span class="pre">WriteViaPandas</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_excel"><code class="docutils literal notranslate"><span class="pre">read_excel()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_feather"><code class="docutils literal notranslate"><span class="pre">read_feather()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_parquet"><code class="docutils literal notranslate"><span class="pre">read_parquet()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_sas"><code class="docutils literal notranslate"><span class="pre">read_sas()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_spss"><code class="docutils literal notranslate"><span class="pre">read_spss()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.read_stata"><code class="docutils literal notranslate"><span class="pre">read_stata()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_excel"><code class="docutils literal notranslate"><span class="pre">to_excel()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_feather"><code class="docutils literal notranslate"><span class="pre">to_feather()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_parquet"><code class="docutils literal notranslate"><span class="pre">to_parquet()</span></code></a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#apache_beam.dataframe.io.to_stata"><code class="docutils literal notranslate"><span class="pre">to_stata()</span></code></a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.pandas_top_level_functions.html">apache_beam.dataframe.pandas_top_level_functions module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.partitionings.html">apache_beam.dataframe.partitionings module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.schemas.html">apache_beam.dataframe.schemas module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.transforms.html">apache_beam.dataframe.transforms module</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.testing.html">apache_beam.testing package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.yaml.html">apache_beam.yaml package</a></li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li> |
| </ul> |
| |
| </div> |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" > |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="index.html">Apache Beam</a> |
| </nav> |
| |
| <div class="wy-nav-content"> |
| <div class="rst-content"> |
| <div role="navigation" aria-label="Page navigation"> |
| <ul class="wy-breadcrumbs"> |
| <li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li> |
| <li class="breadcrumb-item"><a href="apache_beam.dataframe.html">apache_beam.dataframe package</a></li> |
| <li class="breadcrumb-item active">apache_beam.dataframe.io module</li> |
| <li class="wy-breadcrumbs-aside"> |
| <a href="_sources/apache_beam.dataframe.io.rst.txt" rel="nofollow"> View page source</a> |
| </li> |
| </ul> |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <section id="module-apache_beam.dataframe.io"> |
| <span id="apache-beam-dataframe-io-module"></span><h1>apache_beam.dataframe.io module<a class="headerlink" href="#module-apache_beam.dataframe.io" title="Link to this heading"></a></h1> |
| <p>Sources and sinks for the Beam DataFrame API.</p> |
| <section id="sources"> |
| <h2>Sources<a class="headerlink" href="#sources" title="Link to this heading"></a></h2> |
| <p>This module provides analogs for pandas <code class="docutils literal notranslate"><span class="pre">read</span></code> methods, like |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.read_csv.html#pandas.read_csv" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pandas.read_csv()</span></code></a>. However, Beam sources like <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> |
| create a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>, and return a |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a> or |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredSeries</span></code></a> representing the contents |
| of the referenced file(s) or data source.</p> |
| <p>The result of these methods must be applied to a <code class="xref py py-class docutils literal notranslate"><span class="pre">Pipeline</span></code> |
| object, for example:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="n">beam</span><span class="o">.</span><span class="n">dataframe</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="o">...</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="sinks"> |
| <h2>Sinks<a class="headerlink" href="#sinks" title="Link to this heading"></a></h2> |
| <p>This module also defines analogs for pandas sink, or <code class="docutils literal notranslate"><span class="pre">to</span></code>, methods that |
| generate a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>. Users should prefer calling |
| these operations from <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a> |
| instances (for example with |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame.to_csv" title="apache_beam.dataframe.frames.DeferredDataFrame.to_csv"><code class="xref py py-meth docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></a>).</p> |
| </section> |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_gbq"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_gbq</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">table</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">project_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_bqstorage_api</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_gbq"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_gbq" title="Link to this definition"></a></dt> |
| <dd><p>This function reads data from a BigQuery table and produces a |
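| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a>.</p> |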
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
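| <li><p><strong>table</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The table to read. It can be given in the fully qualified form |
| ‘PROJECT:dataset.table’, in which case the <code class="docutils literal notranslate"><span class="pre">dataset</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">project_id</span></code> parameters below can be omitted.</p></li> |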
| <li><p><strong>dataset</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The dataset that contains the table |
| (can be omitted if <code class="docutils literal notranslate"><span class="pre">table</span></code> was specified as ‘PROJECT:dataset.table’).</p></li> |
| <li><p><strong>project_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The project ID |
| (can be omitted if <code class="docutils literal notranslate"><span class="pre">table</span></code> was specified as ‘PROJECT:dataset.table’).</p></li> |
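| <li><p><strong>use_bqstorage_api</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a>) – Set this flag to |
| <code class="docutils literal notranslate"><span class="pre">True</span></code> to use the BigQuery Storage API in ReadFromBigQuery. |
| Otherwise, set it to <code class="docutils literal notranslate"><span class="pre">False</span></code> |
| or leave it unspecified (the default is <code class="docutils literal notranslate"><span class="pre">False</span></code>).</p></li> |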
| </ul> |
| </dd> |
| </dl> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_csv"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_csv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">splittable</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">binary</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_csv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_csv" title="Link to this definition"></a></dt> |
| <dd><p>Read a comma-separated values (csv) file into DataFrame.</p> |
| <p>Also supports optionally iterating over the file or breaking it |
| into chunks.</p> |
| <p>Additional help can be found in the online docs for |
| <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">IO Tools</a>.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is |
| expected. A local file could be: <a class="reference external" href="file://localhost/path/to/table.csv">file://localhost/path/to/table.csv</a>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, such as |
| a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </p></li> |
| <li><p><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – Character or regex pattern to treat as the delimiter. If <code class="docutils literal notranslate"><span class="pre">sep=None</span></code>, the |
| C engine cannot automatically detect |
| the separator, but the Python parsing engine can, meaning the latter will |
| be used and automatically detect the separator from only the first valid |
| row of the file by Python’s builtin sniffer tool, <code class="docutils literal notranslate"><span class="pre">csv.Sniffer</span></code>. |
| In addition, separators longer than 1 character and different from |
| <code class="docutils literal notranslate"><span class="pre">'\s+'</span></code> will be interpreted as regular expressions and will also force |
| the use of the Python parsing engine. Note that regex delimiters are prone |
| to ignoring quoted data. Regex example: <code class="docutils literal notranslate"><span class="pre">'\r\t'</span></code>.</p></li> |
| <li><p><strong>delimiter</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Alias for <code class="docutils literal notranslate"><span class="pre">sep</span></code>.</p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>Sequence</em><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>'infer'</em><em> or </em><em>None</em><em>, </em><em>default 'infer'</em>) – Row number(s) containing column labels and marking the start of the |
| data (zero-indexed). Default behavior is to infer the column names: if no <code class="docutils literal notranslate"><span class="pre">names</span></code> |
| are passed the behavior is identical to <code class="docutils literal notranslate"><span class="pre">header=0</span></code> and column |
| names are inferred from the first line of the file, if column |
| names are passed explicitly to <code class="docutils literal notranslate"><span class="pre">names</span></code> then the behavior is identical to |
| <code class="docutils literal notranslate"><span class="pre">header=None</span></code>. Explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to be able to |
| replace existing names. The header can be a list of integers that |
| specify row locations for a <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.MultiIndex.html#pandas.MultiIndex" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code></a> on the columns |
| e.g. <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">3]</span></code>. Intervening rows that are not specified will be |
| skipped (e.g. 2 in this example is skipped). Note that this |
| parameter ignores commented lines and empty lines if |
| <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>, so <code class="docutils literal notranslate"><span class="pre">header=0</span></code> denotes the first line of |
| data rather than the first line of the file.</p></li> |
| <li><p><strong>names</strong> (<em>Sequence</em><em> of </em><em>Hashable</em><em>, </em><em>optional</em>) – Sequence of column labels to apply. If the file contains a header row, |
| then you should explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to override the column names. |
| Duplicates in this list are not allowed.</p></li> |
| <li><p><strong>index_col</strong> (<em>Hashable</em><em>, </em><em>Sequence</em><em> of </em><em>Hashable</em><em> or </em><em>False</em><em>, </em><em>optional</em>) – <p>Column(s) to use as row label(s), denoted either by column labels or column |
| indices. If a sequence of labels or indices is given, <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.MultiIndex.html#pandas.MultiIndex" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code></a> |
| will be formed for the row labels.</p> |
| <p>Note: <code class="docutils literal notranslate"><span class="pre">index_col=False</span></code> can be used to force pandas to <em>not</em> use the first |
| column as the index, e.g., when you have a malformed file with delimiters at |
| the end of each line.</p> |
| </p></li> |
| <li><p><strong>usecols</strong> (<em>Sequence</em><em> of </em><em>Hashable</em><em> or </em><em>Callable</em><em>, </em><em>optional</em>) – <p>Subset of columns to select, denoted either by column labels or column indices. |
| If list-like, all elements must either |
| be positional (i.e. integer indices into the document columns) or strings |
| that correspond to column names provided either by the user in <code class="docutils literal notranslate"><span class="pre">names</span></code> or |
| inferred from the document header row(s). If <code class="docutils literal notranslate"><span class="pre">names</span></code> are given, the document |
| header row(s) are not taken into account. For example, a valid list-like |
| <code class="docutils literal notranslate"><span class="pre">usecols</span></code> parameter would be <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">2]</span></code> or <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar',</span> <span class="pre">'baz']</span></code>. |
| Element order is ignored, so <code class="docutils literal notranslate"><span class="pre">usecols=[0,</span> <span class="pre">1]</span></code> is the same as <code class="docutils literal notranslate"><span class="pre">[1,</span> <span class="pre">0]</span></code>. |
| To instantiate a <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> from <code class="docutils literal notranslate"><span class="pre">data</span></code> with element order |
| preserved use <code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['foo',</span> <span class="pre">'bar']]</span></code> |
| for columns in <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar']</span></code> order or |
| <code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['bar',</span> <span class="pre">'foo']]</span></code> |
| for <code class="docutils literal notranslate"><span class="pre">['bar',</span> <span class="pre">'foo']</span></code> order.</p> |
| <p>If callable, the callable function will be evaluated against the column |
| names, returning names where the callable function evaluates to <code class="docutils literal notranslate"><span class="pre">True</span></code>. An |
| example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x.upper()</span> <span class="pre">in</span> |
| <span class="pre">['AAA',</span> <span class="pre">'BBB',</span> <span class="pre">'DDD']</span></code>. Using this parameter results in much faster |
| parsing time and lower memory usage.</p> |
| </p></li> |
| <li><p><strong>dtype</strong> (<em>dtype</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>{Hashable : dtype}</em><em>, </em><em>optional</em>) – <p>Data type(s) to apply to either the whole dataset or individual columns. |
| E.g., <code class="docutils literal notranslate"><span class="pre">{'a':</span> <span class="pre">np.float64,</span> <span class="pre">'b':</span> <span class="pre">np.int32,</span> <span class="pre">'c':</span> <span class="pre">'Int64'}</span></code>. |
| Use <code class="docutils literal notranslate"><span class="pre">str</span></code> or <code class="docutils literal notranslate"><span class="pre">object</span></code> together with suitable <code class="docutils literal notranslate"><span class="pre">na_values</span></code> settings |
| to preserve and not interpret <code class="docutils literal notranslate"><span class="pre">dtype</span></code>. |
| If <code class="docutils literal notranslate"><span class="pre">converters</span></code> are specified, they will be applied INSTEAD |
| of <code class="docutils literal notranslate"><span class="pre">dtype</span></code> conversion.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Support for <code class="docutils literal notranslate"><span class="pre">defaultdict</span></code> was added. Specify a <code class="docutils literal notranslate"><span class="pre">defaultdict</span></code> as input where |
| the default determines the <code class="docutils literal notranslate"><span class="pre">dtype</span></code> of the columns which are not explicitly |
| listed.</p> |
| </div> |
| </p></li> |
| <li><p><strong>engine</strong> (<em>{'c'</em><em>, </em><em>'python'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>optional</em>) – <p>Parser engine to use. The C and pyarrow engines are faster, while the python engine |
| is currently more feature-complete. Multithreading is currently only supported by |
| the pyarrow engine.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.4.0: </span>The ‘pyarrow’ engine was added as an <em>experimental</em> engine, and some features |
| are unsupported, or may not work correctly, with this engine.</p> |
| </div> |
| </p></li> |
| <li><p><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>{Hashable : Callable}</em><em>, </em><em>optional</em>) – Functions for converting values in specified columns. Keys can either |
| be column labels or column indices.</p></li> |
| <li><p><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as <code class="docutils literal notranslate"><span class="pre">True</span></code> in addition to case-insensitive variants of ‘True’.</p></li> |
| <li><p><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as <code class="docutils literal notranslate"><span class="pre">False</span></code> in addition to case-insensitive variants of ‘False’.</p></li> |
| <li><p><strong>skipinitialspace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Skip spaces after delimiter.</p></li> |
| <li><p><strong>skiprows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><em>Callable</em><em>, </em><em>optional</em>) – <p>Line numbers to skip (0-indexed) or number of lines to skip (<code class="docutils literal notranslate"><span class="pre">int</span></code>) |
| at the start of the file.</p> |
| <p>If callable, the callable function will be evaluated against the row |
| indices, returning <code class="docutils literal notranslate"><span class="pre">True</span></code> if the row should be skipped and <code class="docutils literal notranslate"><span class="pre">False</span></code> otherwise. |
| An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</p> |
| </p></li> |
| <li><p><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 0</em>) – Number of lines at bottom of file to skip (Unsupported with <code class="docutils literal notranslate"><span class="pre">engine='c'</span></code>).</p></li> |
| <li><p><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Number of rows of file to read. Useful for reading pieces of large files.</p></li> |
| <li><p><strong>na_values</strong> (<em>Hashable</em><em>, </em><em>Iterable</em><em> of </em><em>Hashable</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>{Hashable : Iterable}</em><em>, </em><em>optional</em>) – Additional strings to recognize as <code class="docutils literal notranslate"><span class="pre">NA</span></code>/<code class="docutils literal notranslate"><span class="pre">NaN</span></code>. If <code class="docutils literal notranslate"><span class="pre">dict</span></code> passed, specific |
| per-column <code class="docutils literal notranslate"><span class="pre">NA</span></code> values. By default the following values are interpreted as |
| <code class="docutils literal notranslate"><span class="pre">NaN</span></code>: “” (the empty string), “#N/A”, “#N/A N/A”, “#NA”, “-1.#IND”, “-1.#QNAN”, “-NaN”, “-nan”, |
| “1.#IND”, “1.#QNAN”, “&lt;NA&gt;”, “N/A”, “NA”, “NULL”, “NaN”, “None”, |
| “n/a”, “nan”, “null”.</p></li> |
| <li><p><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default <code class="docutils literal notranslate"><span class="pre">NaN</span></code> values when parsing the data. |
| Depending on whether <code class="docutils literal notranslate"><span class="pre">na_values</span></code> is passed in, the behavior is as follows:</p> |
| <ul> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is <code class="docutils literal notranslate"><span class="pre">True</span></code>, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are specified, <code class="docutils literal notranslate"><span class="pre">na_values</span></code> |
| is appended to the default <code class="docutils literal notranslate"><span class="pre">NaN</span></code> values used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is <code class="docutils literal notranslate"><span class="pre">True</span></code>, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are not specified, only |
| the default <code class="docutils literal notranslate"><span class="pre">NaN</span></code> values are used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is <code class="docutils literal notranslate"><span class="pre">False</span></code>, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are specified, only |
| the <code class="docutils literal notranslate"><span class="pre">NaN</span></code> values specified in <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is <code class="docutils literal notranslate"><span class="pre">False</span></code>, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are not specified, no |
| strings will be parsed as <code class="docutils literal notranslate"><span class="pre">NaN</span></code>.</p></li> |
| </ul> |
| <p>Note that if <code class="docutils literal notranslate"><span class="pre">na_filter</span></code> is passed in as <code class="docutils literal notranslate"><span class="pre">False</span></code>, the <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">na_values</span></code> parameters will be ignored.</p> |
| </p></li> |
| <li><p><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of <code class="docutils literal notranslate"><span class="pre">na_values</span></code>). In |
| data without any <code class="docutils literal notranslate"><span class="pre">NA</span></code> values, passing <code class="docutils literal notranslate"><span class="pre">na_filter=False</span></code> can improve the |
| performance of reading a large file.</p></li> |
| <li><p><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>Indicate number of <code class="docutils literal notranslate"><span class="pre">NA</span></code> values placed in non-numeric columns.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>skip_blank_lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code>, skip over blank lines rather than interpreting as <code class="docutils literal notranslate"><span class="pre">NaN</span></code> values.</p></li> |
| <li><p><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><em>Hashable</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><em>lists</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>{Hashable : list}</em><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">bool</span></code>. If <code class="docutils literal notranslate"><span class="pre">True</span></code> -> try parsing the index. Note: Automatically set to |
| <code class="docutils literal notranslate"><span class="pre">True</span></code> if <code class="docutils literal notranslate"><span class="pre">date_format</span></code> or <code class="docutils literal notranslate"><span class="pre">date_parser</span></code> arguments have been passed.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">list</span></code> of <code class="docutils literal notranslate"><span class="pre">int</span></code> or names. e.g. If <code class="docutils literal notranslate"><span class="pre">[1,</span> <span class="pre">2,</span> <span class="pre">3]</span></code> -> try parsing columns 1, 2, 3 |
| each as a separate date column.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">list</span></code> of <code class="docutils literal notranslate"><span class="pre">list</span></code>. e.g. If <code class="docutils literal notranslate"><span class="pre">[[1,</span> <span class="pre">3]]</span></code> -> combine columns 1 and 3 and parse |
| as a single date column. Values are joined with a space before parsing.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">dict</span></code>, e.g. <code class="docutils literal notranslate"><span class="pre">{'foo'</span> <span class="pre">:</span> <span class="pre">[1,</span> <span class="pre">3]}</span></code> -> parse columns 1, 3 as date and call |
| result ‘foo’. Values are joined with a space before parsing.</p></li> |
| </ul> |
| <p>If a column or index cannot be represented as an array of <code class="docutils literal notranslate"><span class="pre">datetime</span></code>, |
| say because of an unparsable value or a mixture of timezones, the column |
| or index will be returned unaltered as an <code class="docutils literal notranslate"><span class="pre">object</span></code> data type. For |
| non-standard <code class="docutils literal notranslate"><span class="pre">datetime</span></code> parsing, use <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.to_datetime.html#pandas.to_datetime" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code></a> after |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.read_csv.html#pandas.read_csv" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a>.</p> |
| <p>Note: A fast-path exists for iso8601-formatted dates.</p> |
| </p></li> |
| <li><p><strong>infer_datetime_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>If <code class="docutils literal notranslate"><span class="pre">True</span></code> and <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code> is enabled, pandas will attempt to infer the |
| format of the <code class="docutils literal notranslate"><span class="pre">datetime</span></code> strings in the columns, and if it can be inferred, |
| switch to a faster method of parsing them. In some cases this can increase |
| the parsing speed by 5-10x.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.0.0: </span>A strict version of this argument is now the default, passing it has no effect.</p> |
| </div> |
| </p></li> |
| <li><p><strong>keep_date_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code> and <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code> specifies combining multiple columns then |
| keep the original columns.</p></li> |
| <li><p><strong>date_parser</strong> (<em>Callable</em><em>, </em><em>optional</em>) – <p>Function to use for converting a sequence of string columns to an array of |
| <code class="docutils literal notranslate"><span class="pre">datetime</span></code> instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the |
| conversion. pandas will try to call <code class="docutils literal notranslate"><span class="pre">date_parser</span></code> in three different ways, |
| advancing to the next if an exception occurs: 1) Pass one or more arrays |
| (as defined by <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>) as arguments; 2) concatenate (row-wise) the |
| string values from the columns defined by <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code> into a single array |
| and pass that; and 3) call <code class="docutils literal notranslate"><span class="pre">date_parser</span></code> once for each row using one or |
| more strings (corresponding to the columns defined by <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>) as |
| arguments.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.0.0: </span>Use <code class="docutils literal notranslate"><span class="pre">date_format</span></code> instead, or read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.to_datetime.html#pandas.to_datetime" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code></a> as-needed.</p> |
| </div> |
| </p></li> |
| <li><p><strong>date_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>column -> format</em><em>, </em><em>optional</em>) – <p>Format to use for parsing dates when used in conjunction with <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>. |
| The strftime format used to parse time, e.g. <code class="xref py py-const docutils literal notranslate"><span class="pre">"%d/%m/%Y"</span></code>. See |
| <a class="reference external" href="https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior">strftime documentation</a> for more information on choices, though |
| note that <code class="xref py py-const docutils literal notranslate"><span class="pre">"%f"</span></code> will parse all the way up to nanoseconds. |
| You can also pass:</p> |
| <ul> |
| <li><p>“ISO8601”, to parse any <a class="reference external" href="https://en.wikipedia.org/wiki/ISO_8601">ISO8601</a> |
| time string (not necessarily in exactly the same format);</p></li> |
| <li><p>“mixed”, to infer the format for each element individually. This is risky, |
| and you should probably use it along with <code class="docutils literal notranslate"><span class="pre">dayfirst</span></code>.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>dayfirst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – DD/MM format dates, international and European format.</p></li> |
| <li><p><strong>cache_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code>, use a cache of unique, converted dates to apply the <code class="docutils literal notranslate"><span class="pre">datetime</span></code> |
| conversion. May produce significant speed-up when parsing duplicate |
| date strings, especially ones with timezone offsets.</p></li> |
| <li><p><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Return <code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> object for iteration or getting chunks with |
| <code class="docutils literal notranslate"><span class="pre">get_chunk()</span></code>.</p></li> |
| <li><p><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Number of lines to read from the file per chunk. Passing a value will cause the |
| function to return a <code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> object for iteration. |
| See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking">IO Tools docs</a> |
| for more information on <code class="docutils literal notranslate"><span class="pre">iterator</span></code> and <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <code class="docutils literal notranslate"><span class="pre">.tar</span></code> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </p></li> |
| <li><p><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character acting as the thousands separator in numerical values.</p></li> |
| <li><p><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g., use ‘,’ for European data).</p></li> |
| <li><p><strong>lineterminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character used to denote a line break. Only valid with C parser.</p></li> |
| <li><p><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character used to denote the start and end of a quoted item. Quoted |
| items can include the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code> and it will be ignored.</p></li> |
| <li><p><strong>quoting</strong> (<em>{0</em><em> or </em><em>csv.QUOTE_MINIMAL</em><em>, </em><em>1</em><em> or </em><em>csv.QUOTE_ALL</em><em>, </em><em>2</em><em> or </em><em>csv.QUOTE_NONNUMERIC</em><em>, </em><em>3</em><em> or </em><em>csv.QUOTE_NONE}</em><em>, </em><em>default csv.QUOTE_MINIMAL</em>) – Control field quoting behavior per <code class="docutils literal notranslate"><span class="pre">csv.QUOTE_*</span></code> constants. Default is |
| <code class="docutils literal notranslate"><span class="pre">csv.QUOTE_MINIMAL</span></code> (i.e., 0) which implies that only fields containing special |
| characters are quoted (e.g., characters defined in <code class="docutils literal notranslate"><span class="pre">quotechar</span></code>, <code class="docutils literal notranslate"><span class="pre">delimiter</span></code>, |
| or <code class="docutils literal notranslate"><span class="pre">lineterminator</span></code>).</p></li> |
| <li><p><strong>doublequote</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – When <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> is specified and <code class="docutils literal notranslate"><span class="pre">quoting</span></code> is not <code class="docutils literal notranslate"><span class="pre">QUOTE_NONE</span></code>, indicate |
| whether or not to interpret two consecutive <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> elements INSIDE a |
| field as a single <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> element.</p></li> |
| <li><p><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character used to escape other characters.</p></li> |
| <li><p><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character indicating that the remainder of line should not be parsed. |
| If found at the beginning |
| of a line, the line will be ignored altogether. This parameter must be a |
| single character. Like empty lines (as long as <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>), |
| fully commented lines are ignored by the parameter <code class="docutils literal notranslate"><span class="pre">header</span></code> but not by |
| <code class="docutils literal notranslate"><span class="pre">skiprows</span></code>. For example, if <code class="docutils literal notranslate"><span class="pre">comment='#'</span></code>, parsing |
| <code class="docutils literal notranslate"><span class="pre">#empty\na,b,c\n1,2,3</span></code> with <code class="docutils literal notranslate"><span class="pre">header=0</span></code> will result in <code class="docutils literal notranslate"><span class="pre">'a,b,c'</span></code> being |
| treated as the header.</p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default 'utf-8'</em>) – Encoding to use for UTF when reading/writing (ex. <code class="docutils literal notranslate"><span class="pre">'utf-8'</span></code>). <a class="reference external" href="https://docs.python.org/3/library/codecs.html#standard-encodings">List of Python |
| standard encodings</a>.</p></li> |
| <li><p><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default 'strict'</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.3.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>dialect</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/csv.html#csv.Dialect" title="(in Python v3.13)"><em>csv.Dialect</em></a><em>, </em><em>optional</em>) – If provided, this parameter will override values (default or not) for the |
| following parameters: <code class="docutils literal notranslate"><span class="pre">delimiter</span></code>, <code class="docutils literal notranslate"><span class="pre">doublequote</span></code>, <code class="docutils literal notranslate"><span class="pre">escapechar</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">skipinitialspace</span></code>, <code class="docutils literal notranslate"><span class="pre">quotechar</span></code>, and <code class="docutils literal notranslate"><span class="pre">quoting</span></code>. If it is necessary to |
| override values, a <code class="docutils literal notranslate"><span class="pre">ParserWarning</span></code> will be issued. See <code class="docutils literal notranslate"><span class="pre">csv.Dialect</span></code> |
| documentation for more details.</p></li> |
| <li><p><strong>on_bad_lines</strong> (<em>{'error'</em><em>, </em><em>'warn'</em><em>, </em><em>'skip'}</em><em> or </em><em>Callable</em><em>, </em><em>default 'error'</em>) – <p>Specifies what to do upon encountering a bad line (a line with too many fields). |
| Allowed values are:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'error'</span></code>, raise an Exception when a bad line is encountered.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'warn'</span></code>, raise a warning when a bad line is encountered and skip that line.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'skip'</span></code>, skip bad lines without raising or warning when they are encountered.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.3.0.</span></p> |
| </div> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.4.0: </span></p> |
| <ul> |
| <li><p>Callable, function with signature |
| <code class="docutils literal notranslate"><span class="pre">(bad_line:</span> <span class="pre">list[str])</span> <span class="pre">-></span> <span class="pre">list[str]</span> <span class="pre">|</span> <span class="pre">None</span></code> that will process a single |
| bad line. <code class="docutils literal notranslate"><span class="pre">bad_line</span></code> is a list of strings split by the <code class="docutils literal notranslate"><span class="pre">sep</span></code>. |
| If the function returns <code class="docutils literal notranslate"><span class="pre">None</span></code>, the bad line will be ignored. |
| If the function returns a new <code class="docutils literal notranslate"><span class="pre">list</span></code> of strings with more elements than |
| expected, a <code class="docutils literal notranslate"><span class="pre">ParserWarning</span></code> will be emitted while dropping extra elements. |
| Only supported when <code class="docutils literal notranslate"><span class="pre">engine='python'</span></code></p></li> |
| </ul> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 2.2.0: </span></p> |
| <ul> |
| <li><p>Callable, function with signature |
| as described in <a class="reference external" href="https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html#pyarrow.csv.ParseOptions.invalid_row_handler">pyarrow documentation</a> when <code class="docutils literal notranslate"><span class="pre">engine='pyarrow'</span></code></p></li> |
| </ul> |
| </div> |
| </p></li> |
| <li><p><strong>delim_whitespace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>Specifies whether or not whitespace (e.g. <code class="docutils literal notranslate"><span class="pre">'</span> <span class="pre">'</span></code> or <code class="docutils literal notranslate"><span class="pre">'\t'</span></code>) will be |
| used as the <code class="docutils literal notranslate"><span class="pre">sep</span></code> delimiter. Equivalent to setting <code class="docutils literal notranslate"><span class="pre">sep='\s+'</span></code>. If this option |
| is set to <code class="docutils literal notranslate"><span class="pre">True</span></code>, nothing should be passed in for the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code> |
| parameter.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.2.0: </span>Use <code class="docutils literal notranslate"><span class="pre">sep="\s+"</span></code> instead.</p> |
| </div> |
| </p></li> |
| <li><p><strong>low_memory</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Internally process the file in chunks, resulting in lower memory use |
| while parsing, but possibly mixed type inference. To ensure no mixed |
| types either set <code class="docutils literal notranslate"><span class="pre">False</span></code>, or specify the type with the <code class="docutils literal notranslate"><span class="pre">dtype</span></code> parameter. |
| Note that the entire file is read into a single <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| regardless, use the <code class="docutils literal notranslate"><span class="pre">chunksize</span></code> or <code class="docutils literal notranslate"><span class="pre">iterator</span></code> parameter to return the data in |
| chunks. (Only valid with C parser).</p></li> |
| <li><p><strong>memory_map</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – If a filepath is provided for <code class="docutils literal notranslate"><span class="pre">filepath_or_buffer</span></code>, map the file object |
| directly onto memory and access the data directly from there. Using this |
| option can improve performance because there is no longer any I/O overhead.</p></li> |
| <li><p><strong>float_precision</strong> (<em>{'high'</em><em>, </em><em>'legacy'</em><em>, </em><em>'round_trip'}</em><em>, </em><em>optional</em>) – Specifies which converter the C engine should use for floating-point |
| values. The options are <code class="docutils literal notranslate"><span class="pre">None</span></code> or <code class="docutils literal notranslate"><span class="pre">'high'</span></code> for the ordinary converter, |
| <code class="docutils literal notranslate"><span class="pre">'legacy'</span></code> for the original lower precision pandas converter, and |
| <code class="docutils literal notranslate"><span class="pre">'round_trip'</span></code> for the round-trip converter.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>A comma-separated values (csv) file is returned as a two-dimensional |
| data structure with labeled axes.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextFileReader</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>If your files are large and records do not contain quoted newlines, you may |
| pass the extra argument <code class="docutils literal notranslate"><span class="pre">splittable=True</span></code> to enable dynamic splitting for |
| this read on newlines. Using this option for records that do contain quoted |
| newlines may result in partial records and data corruption.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></dt><dd><p>Write DeferredDataFrame to a comma-separated values (csv) file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_table</span></code></dt><dd><p>Read general delimited file into DeferredDataFrame.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_fwf</span></code></a></dt><dd><p>Read a table of fixed-width formatted lines into DeferredDataFrame.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API. In addition, some arguments shown here may not be supported, see <strong>‘Differences from pandas’</strong> for details.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">'data.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_csv"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_csv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">transform_label</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_csv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_csv" title="Link to this definition"></a></dt> |
| <dd><p>Write object to a comma-separated values (csv) file.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><em>None</em><em>, </em><em>default None</em>) – String, path object (implementing os.PathLike[str]), or file-like |
| object implementing a write() function. If None, the result is |
| returned as a string. If a non-binary file object is passed, it should |
| be opened with <cite>newline=''</cite>, disabling universal newlines. If a binary |
| file object is passed, <cite>mode</cite> might need to contain a <cite>'b'</cite>.</p></li> |
| <li><p><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – String of length 1. Field delimiter for the output file.</p></li> |
| <li><p><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</p></li> |
| <li><p><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>Callable</em><em>, </em><em>default None</em>) – Format string for floating point numbers. If a Callable is given, it takes |
| precedence over other numeric formatting parameters, like decimal.</p></li> |
| <li><p><strong>columns</strong> (<em>sequence</em><em>, </em><em>optional</em>) – Columns to write.</p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default True</em>) – Write out the column names. If a list of strings is given it is |
| assumed to be aliases for the column names.</p></li> |
| <li><p><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</p></li> |
| <li><p><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><em>sequence</em><em>, or </em><em>False</em><em>, </em><em>default None</em>) – Column label for index column(s) if desired. If None is given, and |
| <cite>header</cite> and <cite>index</cite> are True, then the index names are used. A |
| sequence should be given if the object uses MultiIndex. If |
| False do not print fields for index names. Use index_label=False |
| for easier importing in R.</p></li> |
| <li><p><strong>mode</strong> (<em>{'w'</em><em>, </em><em>'x'</em><em>, </em><em>'a'}</em><em>, </em><em>default 'w'</em>) – <p>Forwarded to either <cite>open(mode=)</cite> or <cite>fsspec.open(mode=)</cite> to control |
| the file opening. Typical values include:</p> |
| <ul> |
| <li><p>'w', truncate the file first.</p></li> |
| <li><p>'x', exclusive creation, failing if the file already exists.</p></li> |
| <li><p>'a', append to the end of file if it exists.</p></li> |
| </ul> |
| </p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – A string representing the encoding to use in the output file, |
| defaults to ‘utf-8’. <cite>encoding</cite> is not supported if <cite>path_or_buf</cite> |
| is a non-binary file object.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| <p>May be a dict with key ‘method’ as compression mode |
| and other entries as additional compression options if |
| compression mode is ‘zip’.</p> |
| <p>Passing compression options as keys in dict is |
| supported for compression modes ‘gzip’, ‘bz2’, ‘zstd’, and ‘zip’.</p> |
| </div> |
| </p></li> |
| <li><p><strong>quoting</strong> (<em>optional constant from csv module</em>) – Defaults to csv.QUOTE_MINIMAL. If you have set a <cite>float_format</cite> |
| then floats are converted to strings and thus csv.QUOTE_NONNUMERIC |
| will treat them as non-numeric.</p></li> |
| <li><p><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '"'</em>) – String of length 1. Character used to quote fields.</p></li> |
| <li><p><strong>lineterminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – <p>The newline character or character sequence to use in the output |
| file. Defaults to <cite>os.linesep</cite>, which depends on the OS in which |
| this method is called (e.g. '\n' for Linux, '\r\n' for Windows).</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 1.5.0: </span>Previously was line_terminator, changed for consistency with |
| read_csv and the standard library ‘csv’ module.</p> |
| </div> |
| </p></li> |
| <li><p><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><em>None</em>) – Rows to write at a time.</p></li> |
| <li><p><strong>date_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – Format string for datetime objects.</p></li> |
| <li><p><strong>doublequote</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Control quoting of <cite>quotechar</cite> inside a field.</p></li> |
| <li><p><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – String of length 1. Character used to escape <cite>sep</cite> and <cite>quotechar</cite> |
| when appropriate.</p></li> |
| <li><p><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator. E.g. use ‘,’ for |
| European data.</p></li> |
| <li><p><strong>errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default 'strict'</em>) – Specifies how encoding and decoding errors are to be handled. |
| See the errors argument for <a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.13)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> for a full list |
| of options.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>If path_or_buf is None, returns the resulting csv format as a |
| string. Otherwise returns None.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p>None or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)">str</a></p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_csv</span></code></a></dt><dd><p>Load a CSV file into a DeferredDataFrame.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.to_excel" title="apache_beam.dataframe.io.to_excel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_excel</span></code></a></dt><dd><p>Write DeferredDataFrame to an Excel file.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Create 'out.csv' containing 'df' without indices</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'name'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'Raphael'</span><span class="p">,</span> <span class="s1">'Donatello'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'mask'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'purple'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'weapon'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'sai'</span><span class="p">,</span> <span class="s1">'bo staff'</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'out.csv'</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| |
| <span class="go">Create 'out.zip' containing 'out.csv'</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="go">'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'</span> |
| <span class="gp">>>> </span><span class="n">compression_opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s1">'zip'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">archive_name</span><span class="o">=</span><span class="s1">'out.csv'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'out.zip'</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="n">compression_opts</span><span class="p">)</span> |
| |
| <span class="go">To write a csv file to a new folder or nested folder you will first</span> |
| <span class="go">need to create it using either Pathlib or os:</span> |
| |
| <span class="gp">>>> </span><span class="kn">from</span><span class="w"> </span><span class="nn">pathlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">Path</span> |
| <span class="gp">>>> </span><span class="n">filepath</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="s1">'folder/subfolder/out.csv'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">filepath</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">filepath</span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="kn">import</span><span class="w"> </span><span class="nn">os</span> |
| <span class="gp">>>> </span><span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="s1">'folder/subfolder'</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'folder/subfolder/out.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_fwf"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_fwf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_fwf"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_fwf" title="Link to this definition"></a></dt> |
| <dd><p>Read a table of fixed-width formatted lines into DeferredDataFrame.</p> |
| <p>Also supports optionally iterating over the file or breaking it |
| into chunks.</p> |
| <p>Additional help can be found in the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">online docs for IO Tools</a>.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a text <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.csv</span></code>.</p></li> |
| <li><p><strong>colspecs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em> (</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>) or </em><em>'infer'</em><em>, </em><em>optional</em>) – A list of tuples giving the extents of the fixed-width |
| fields of each line as half-open intervals (i.e., [from, to[ ). |
| String value ‘infer’ can be used to instruct the parser to try |
| detecting the column specifications from the first 100 rows of |
| the data which are not being skipped via skiprows (default=’infer’).</p></li> |
| <li><p><strong>widths</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – A list of field widths which can be used instead of ‘colspecs’ if |
| the intervals are contiguous.</p></li> |
| <li><p><strong>infer_nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 100</em>) – The number of rows to consider when letting the parser determine the |
| <cite>colspecs</cite>.</p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>**kwds</strong> (<em>optional</em>) – Optional keyword arguments can be passed to <code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code>.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>A comma-separated values (csv) file is returned as a two-dimensional |
| data structure with labeled axes.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextFileReader</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></dt><dd><p>Write DeferredDataFrame to a comma-separated values (csv) file.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_csv</span></code></a></dt><dd><p>Read a comma-separated values (csv) file into DeferredDataFrame.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_fwf</span><span class="p">(</span><span class="s1">'data.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_json"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_json"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_json" title="Link to this definition"></a></dt> |
| <dd><p>Convert a JSON string to a pandas object.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path_or_buf</strong> (<em>a valid JSON str</em><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.json</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any |
| <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.1.0: </span>Passing json literal strings is deprecated.</p> |
| </div> |
| </p></li> |
| <li><p><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Indication of expected JSON string format. |
| Compatible JSON strings can be produced by <code class="docutils literal notranslate"><span class="pre">to_json()</span></code> with a |
| corresponding orient value. |
| The set of possible orients is:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'split'</span></code> : dict like |
| <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-></span> <span class="pre">[index],</span> <span class="pre">columns</span> <span class="pre">-></span> <span class="pre">[columns],</span> <span class="pre">data</span> <span class="pre">-></span> <span class="pre">[values]}</span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'records'</span></code> : list like |
| <code class="docutils literal notranslate"><span class="pre">[{column</span> <span class="pre">-></span> <span class="pre">value},</span> <span class="pre">...</span> <span class="pre">,</span> <span class="pre">{column</span> <span class="pre">-></span> <span class="pre">value}]</span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'index'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-></span> <span class="pre">{column</span> <span class="pre">-></span> <span class="pre">value}}</span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'columns'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{column</span> <span class="pre">-></span> <span class="pre">{index</span> <span class="pre">-></span> <span class="pre">value}}</span></code></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'values'</span></code> : just the values array</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">'table'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{'schema':</span> <span class="pre">{schema},</span> <span class="pre">'data':</span> <span class="pre">{data}}</span></code></p></li> |
| </ul> |
| <p>The allowed and default values depend on the value |
| of the <cite>typ</cite> parameter.</p> |
| <ul> |
| <li><p>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'series'</span></code>,</p> |
| <ul> |
| <li><p>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index'}</span></code></p></li> |
| <li><p>default is <code class="docutils literal notranslate"><span class="pre">'index'</span></code></p></li> |
| <li><p>The DeferredSeries index must be unique for orient <code class="docutils literal notranslate"><span class="pre">'index'</span></code>.</p></li> |
| </ul> |
| </li> |
| <li><p>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'frame'</span></code>,</p> |
| <ul> |
| <li><p>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index',</span> |
| <span class="pre">'columns','values',</span> <span class="pre">'table'}</span></code></p></li> |
| <li><p>default is <code class="docutils literal notranslate"><span class="pre">'columns'</span></code></p></li> |
| <li><p>The DeferredDataFrame index must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">'columns'</span></code>.</p></li> |
| <li><p>The DeferredDataFrame columns must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">'columns'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'records'</span></code>.</p></li> |
| </ul> |
| </li> |
| </ul> |
| </p></li> |
| <li><p><strong>typ</strong> (<em>{'frame'</em><em>, </em><em>'series'}</em><em>, </em><em>default 'frame'</em>) – The type of object to recover.</p></li> |
| <li><p><strong>dtype</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default None</em>) – <p>If True, infer dtypes; if a dict of column to dtype, then use those; |
| if False, then don’t infer dtypes at all, applies only to the data.</p> |
| <p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p> |
| </p></li> |
| <li><p><strong>convert_axes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default None</em>) – <p>Try to convert the axes to the proper dtypes.</p> |
| <p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p> |
| </p></li> |
| <li><p><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default True</em>) – If True then default datelike columns may be converted (depending on |
| keep_default_dates). |
| If False, no dates will be converted. |
| If a list of column names, then those columns will be converted and |
| default datelike columns may also be converted (depending on |
| keep_default_dates).</p></li> |
| <li><p><strong>keep_default_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>If parsing dates (convert_dates is not False), then try to parse the |
| default datelike columns. |
| A column label is datelike if</p> |
| <ul> |
| <li><p>it ends with <code class="docutils literal notranslate"><span class="pre">'_at'</span></code>,</p></li> |
| <li><p>it ends with <code class="docutils literal notranslate"><span class="pre">'_time'</span></code>,</p></li> |
| <li><p>it begins with <code class="docutils literal notranslate"><span class="pre">'timestamp'</span></code>,</p></li> |
| <li><p>it is <code class="docutils literal notranslate"><span class="pre">'modified'</span></code>, or</p></li> |
| <li><p>it is <code class="docutils literal notranslate"><span class="pre">'date'</span></code>.</p></li> |
| </ul> |
| </p></li> |
| <li><p><strong>precise_float</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Set to enable usage of higher precision (strtod) function when |
| decoding string to double values. Default (False) is to use fast but |
| less precise builtin functionality.</p></li> |
| <li><p><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – The timestamp unit to detect if converting dates. The default behaviour |
| is to try and detect the correct precision, but if this is not desired |
| then pass one of ‘s’, ‘ms’, ‘us’ or ‘ns’ to force parsing only seconds, |
| milliseconds, microseconds or nanoseconds respectively.</p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default is 'utf-8'</em>) – The encoding to use to decode py3 bytes.</p></li> |
| <li><p><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default "strict"</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.3.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Read the file as a json object per line.</p></li> |
| <li><p><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Return JsonReader object for iteration. |
| See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json">line-delimited json docs</a> |
| for more information on <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>. |
| This can only be passed if <cite>lines=True</cite>. |
| If this is None, the file will be read into memory all at once.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </p></li> |
| <li><p><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The number of lines to read from the line-delimited JSON file. |
| This can only be passed if <cite>lines=True</cite>. |
| If this is None, all the rows will be returned.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details; for more examples of storage options, see the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas guide to reading and writing remote files</a>.</p> |
| </p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>engine</strong> (<em>{"ujson"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>default "ujson"</em>) – <p>Parser engine to use. The <code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code> engine is only available when |
| <code class="docutils literal notranslate"><span class="pre">lines=True</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>A JsonReader is returned when <code class="docutils literal notranslate"><span class="pre">chunksize</span></code> is not <code class="docutils literal notranslate"><span class="pre">0</span></code> or <code class="docutils literal notranslate"><span class="pre">None</span></code>. |
| Otherwise, the type returned depends on the value of <code class="docutils literal notranslate"><span class="pre">typ</span></code>.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries">DeferredSeries</a>, <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a>, or pandas.api.typing.JsonReader</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json</span></code></dt><dd><p>Convert a DeferredDataFrame to a JSON string.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredSeries.to_json</span></code></dt><dd><p>Convert a DeferredSeries to a JSON string.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">json_normalize</span></code></dt><dd><p>Normalize semi-structured JSON data into a flat table.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Specific to <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, if a <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> with a literal |
| <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name of <cite>index</cite> gets written with <a class="reference internal" href="#apache_beam.dataframe.io.to_json" title="apache_beam.dataframe.io.to_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_json()</span></code></a>, the |
| subsequent read operation will incorrectly set the <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name to |
| <code class="docutils literal notranslate"><span class="pre">None</span></code>. This is because <cite>index</cite> is also used by <code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json()</span></code> |
| to denote a missing <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name, and the subsequent |
| <a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_json()</span></code></a> operation cannot distinguish between the two. The same |
| limitation is encountered with a <code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code> and any names |
| beginning with <code class="docutils literal notranslate"><span class="pre">'level_'</span></code>.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">from</span><span class="w"> </span><span class="nn">io</span><span class="w"> </span><span class="kn">import</span> <span class="n">StringIO</span> |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">],</span> <span class="p">[</span><span class="s1">'c'</span><span class="p">,</span> <span class="s1">'d'</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">'row 1'</span><span class="p">,</span> <span class="s1">'row 2'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'col 1'</span><span class="p">,</span> <span class="s1">'col 2'</span><span class="p">])</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'split'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'split'</span><span class="p">)</span> |
| <span class="go"> '{"columns":["col 1","col 2"],"index":["row 1","row 2"],"data":[["a","b"],["c","d"]]}'</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">StringIO</span><span class="p">(</span><span class="n">_</span><span class="p">),</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'split'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">row 1 a b</span> |
| <span class="go">row 2 c d</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'index'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'index'</span><span class="p">)</span> |
| <span class="go">'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">StringIO</span><span class="p">(</span><span class="n">_</span><span class="p">),</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'index'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">row 1 a b</span> |
| <span class="go">row 2 c d</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'records'`` formatted JSON.</span> |
| <span class="go">Note that index labels are not preserved with this encoding.</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'records'</span><span class="p">)</span> |
| <span class="go">'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">StringIO</span><span class="p">(</span><span class="n">_</span><span class="p">),</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'records'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">0 a b</span> |
| <span class="go">1 c d</span> |
| |
| <span class="go">Encoding with Table Schema</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'table'</span><span class="p">)</span> |
| <span class="go"> '{"schema":{"fields":[{"name":"index","type":"string"},{"name":"col 1","type":"string"},{"name":"col 2","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":"row 1","col 1":"a","col 2":"b"},{"index":"row 2","col 1":"c","col 2":"d"}]}'</span> |
| |
| <span class="go">The following example uses ``dtype_backend="numpy_nullable"``</span> |
| |
| <span class="gp">>>> </span><span class="n">data</span> <span class="o">=</span> <span class="s1">'''{"index": {"0": 0, "1": 1},</span> |
| <span class="gp">... </span><span class="s1"> "a": {"0": 1, "1": null},</span> |
| <span class="gp">... </span><span class="s1"> "b": {"0": 2.5, "1": 4.5},</span> |
| <span class="gp">... </span><span class="s1"> "c": {"0": true, "1": false},</span> |
| <span class="gp">... </span><span class="s1"> "d": {"0": "a", "1": "b"},</span> |
| <span class="gp">... </span><span class="s1"> "e": {"0": 1577.2, "1": 1577.1}}'''</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">StringIO</span><span class="p">(</span><span class="n">data</span><span class="p">),</span> <span class="n">dtype_backend</span><span class="o">=</span><span class="s2">"numpy_nullable"</span><span class="p">)</span> |
| <span class="go"> index a b c d e</span> |
| <span class="go">0 0 1 2.5 True a 1577.2</span> |
| <span class="go">1 1 <NA> 4.5 False b 1577.1</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_json"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">orient</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_json"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_json" title="Link to this definition"></a></dt> |
| <dd><p>Convert the object to a JSON string.</p> |
| <p>Note that NaN and None values will be converted to null, and datetime objects |
| will be converted to UNIX timestamps.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><em>None</em><em>, </em><em>default None</em>) – String, path object (implementing os.PathLike[str]), or file-like |
| object implementing a write() function. If None, the result is |
| returned as a string.</p></li> |
| <li><p><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – <p>Indication of expected JSON string format.</p> |
| <ul> |
| <li><p>DeferredSeries:</p> |
| <blockquote> |
| <div><ul class="simple"> |
| <li><p>default is ‘index’</p></li> |
| <li><p>allowed values are: {‘split’, ‘records’, ‘index’, ‘table’}.</p></li> |
| </ul> |
| </div></blockquote> |
| </li> |
| <li><p>DeferredDataFrame:</p> |
| <blockquote> |
| <div><ul class="simple"> |
| <li><p>default is ‘columns’</p></li> |
| <li><p>allowed values are: {‘split’, ‘records’, ‘index’, ‘columns’, |
| ‘values’, ‘table’}.</p></li> |
| </ul> |
| </div></blockquote> |
| </li> |
| <li><p>The format of the JSON string:</p> |
| <blockquote> |
| <div><ul class="simple"> |
| <li><p>‘split’ : dict like {‘index’ -> [index], ‘columns’ -> [columns], |
| ‘data’ -> [values]}</p></li> |
| <li><p>‘records’ : list like [{column -> value}, … , {column -> value}]</p></li> |
| <li><p>‘index’ : dict like {index -> {column -> value}}</p></li> |
| <li><p>‘columns’ : dict like {column -> {index -> value}}</p></li> |
| <li><p>‘values’ : just the values array</p></li> |
| <li><p>‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}}</p></li> |
| </ul> |
| <p>Describing the data, where data component is like <code class="docutils literal notranslate"><span class="pre">orient='records'</span></code>.</p> |
| </div></blockquote> |
| </li> |
| </ul> |
| </p></li> |
| <li><p><strong>date_format</strong> (<em>{None</em><em>, </em><em>'epoch'</em><em>, </em><em>'iso'}</em>) – Type of date conversion. ‘epoch’ = epoch milliseconds, |
| ‘iso’ = ISO8601. The default depends on the <cite>orient</cite>. For |
| <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, the default is ‘iso’. For all other orients, |
| the default is ‘epoch’.</p></li> |
| <li><p><strong>double_precision</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 10</em>) – The number of decimal places to use when encoding |
| floating point values. The possible maximal value is 15. |
| Passing double_precision greater than 15 will raise a ValueError.</p></li> |
| <li><p><strong>force_ascii</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Force encoded string to be ASCII.</p></li> |
| <li><p><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default 'ms'</em><em> (</em><em>milliseconds</em><em>)</em>) – The time unit to encode to, governs timestamp and ISO8601 |
| precision. One of ‘s’, ‘ms’, ‘us’, ‘ns’ for second, millisecond, |
| microsecond, and nanosecond respectively.</p></li> |
| <li><p><strong>default_handler</strong> (<em>callable</em><em>, </em><em>default None</em>) – Handler to call if object cannot otherwise be converted to a |
| suitable format for JSON. Should receive a single argument which is |
| the object to convert and return a serialisable object.</p></li> |
| <li><p><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – If ‘orient’ is ‘records’, write out line-delimited JSON format. |
| Raises ValueError for any other ‘orient’, since the other formats are not |
| list-like.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </p></li> |
| <li><p><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em> or </em><em>None</em><em>, </em><em>default None</em>) – The index is only used when ‘orient’ is ‘split’, ‘index’, ‘columns’, |
| or ‘table’. Of these, ‘index’ and ‘columns’ do not support |
| <cite>index=False</cite>.</p></li> |
| <li><p><strong>indent</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Length of whitespace used to indent each record.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| </p></li> |
| <li><p><strong>mode</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default 'w'</em><em> (</em><em>writing</em><em>)</em>) – Specify the IO mode for output when supplying a path_or_buf. |
| Accepted args are ‘w’ (writing) and ‘a’ (append) only. |
| mode=‘a’ is only supported when lines is True and orient is ‘records’.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>If path_or_buf is None, returns the resulting json format as a |
| string. Otherwise returns None.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p>None or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)">str</a></p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_json</span></code></a></dt><dd><p>Convert a JSON string to pandas object.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>The behavior of <code class="docutils literal notranslate"><span class="pre">indent=0</span></code> varies from the stdlib, which does not |
| indent the output but does insert newlines. Currently, <code class="docutils literal notranslate"><span class="pre">indent=0</span></code> |
| and the default <code class="docutils literal notranslate"><span class="pre">indent=None</span></code> are equivalent in pandas, though this |
| may change in a future release.</p> |
| <p><code class="docutils literal notranslate"><span class="pre">orient='table'</span></code> contains a ‘pandas_version’ field under ‘schema’. |
| This stores the version of <cite>pandas</cite> used in the latest revision of the |
| schema.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">from</span><span class="w"> </span><span class="nn">json</span><span class="w"> </span><span class="kn">import</span> <span class="n">loads</span><span class="p">,</span> <span class="n">dumps</span> |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span> |
| <span class="gp">... </span> <span class="p">[[</span><span class="s2">"a"</span><span class="p">,</span> <span class="s2">"b"</span><span class="p">],</span> <span class="p">[</span><span class="s2">"c"</span><span class="p">,</span> <span class="s2">"d"</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s2">"row 1"</span><span class="p">,</span> <span class="s2">"row 2"</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">"col 1"</span><span class="p">,</span> <span class="s2">"col 2"</span><span class="p">],</span> |
| <span class="gp">... </span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"split"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "columns": [</span> |
| <span class="go"> "col 1",</span> |
| <span class="go"> "col 2"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "index": [</span> |
| <span class="go"> "row 1",</span> |
| <span class="go"> "row 2"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "data": [</span> |
| <span class="go"> [</span> |
| <span class="go"> "a",</span> |
| <span class="go"> "b"</span> |
| <span class="go"> ],</span> |
| <span class="go"> [</span> |
| <span class="go"> "c",</span> |
| <span class="go"> "d"</span> |
| <span class="go"> ]</span> |
| <span class="go"> ]</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'records'`` formatted JSON.</span> |
| <span class="go">Note that index labels are not preserved with this encoding.</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"records"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">[</span> |
| <span class="go"> {</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">]</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'index'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"index"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "row 1": {</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> "row 2": {</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"columns"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "col 1": {</span> |
| <span class="go"> "row 1": "a",</span> |
| <span class="go"> "row 2": "c"</span> |
| <span class="go"> },</span> |
| <span class="go"> "col 2": {</span> |
| <span class="go"> "row 1": "b",</span> |
| <span class="go"> "row 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'values'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"values"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">[</span> |
| <span class="go"> [</span> |
| <span class="go"> "a",</span> |
| <span class="go"> "b"</span> |
| <span class="go"> ],</span> |
| <span class="go"> [</span> |
| <span class="go"> "c",</span> |
| <span class="go"> "d"</span> |
| <span class="go"> ]</span> |
| <span class="go">]</span> |
| |
| <span class="go">Encoding with Table Schema:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"table"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "schema": {</span> |
| <span class="go"> "fields": [</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "index",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "col 1",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "col 2",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> }</span> |
| <span class="go"> ],</span> |
| <span class="go"> "primaryKey": [</span> |
| <span class="go"> "index"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "pandas_version": "1.4.0"</span> |
| <span class="go"> },</span> |
| <span class="go"> "data": [</span> |
| <span class="go"> {</span> |
| <span class="go"> "index": "row 1",</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "index": "row 2",</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go"> ]</span> |
| <span class="go">}</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_html"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_html</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_html"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_html" title="Link to this definition"></a></dt> |
| <dd><p>Read HTML tables into a <code class="docutils literal notranslate"><span class="pre">list</span></code> of <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> objects.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – <p>String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a string <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. |
| The string can represent a URL or the HTML itself. Note that |
| lxml only accepts the http, ftp and file url protocols. If you have a |
| URL that starts with <code class="docutils literal notranslate"><span class="pre">'https'</span></code> you might try removing the <code class="docutils literal notranslate"><span class="pre">'s'</span></code>.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.1.0: </span>Passing html literal strings is deprecated. |
| Wrap literal string/bytes input in <code class="docutils literal notranslate"><span class="pre">io.StringIO</span></code>/<code class="docutils literal notranslate"><span class="pre">io.BytesIO</span></code> instead.</p> |
| </div> |
| </p></li> |
| <li><p><strong>match</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><em>compiled regular expression</em><em>, </em><em>optional</em>) – The set of tables containing text matching this regex or string will be |
| returned. Unless the HTML is extremely simple you will probably need to |
| pass a non-empty string here. Defaults to ‘.+’ (match any non-empty |
| string). The default value will return all tables contained on a page. |
| This value is converted to a regular expression so that there is |
| consistent behavior between Beautiful Soup and lxml.</p></li> |
| <li><p><strong>flavor</strong> (<em>{"lxml"</em><em>, </em><em>"html5lib"</em><em>, </em><em>"bs4"}</em><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The parsing engine (or list of parsing engines) to use. ‘bs4’ and |
| ‘html5lib’ are synonymous with each other, they are both there for |
| backwards compatibility. The default of <code class="docutils literal notranslate"><span class="pre">None</span></code> tries to use <code class="docutils literal notranslate"><span class="pre">lxml</span></code> |
| to parse and if that fails it falls back on <code class="docutils literal notranslate"><span class="pre">bs4</span></code> + <code class="docutils literal notranslate"><span class="pre">html5lib</span></code>.</p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The row (or list of rows for a <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.MultiIndex.html#pandas.MultiIndex" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code></a>) to use to |
| make the columns headers.</p></li> |
| <li><p><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The column (or list of columns) to use to create the index.</p></li> |
| <li><p><strong>skiprows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>list-like</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#slice" title="(in Python v3.13)"><em>slice</em></a><em>, </em><em>optional</em>) – Number of rows to skip after parsing the column integer. 0-based. If a |
| sequence of integers or a slice is given, will skip the rows indexed by |
| that sequence. Note that a single element sequence means ‘skip the nth |
| row’ whereas an integer means ‘skip n rows’.</p></li> |
| <li><p><strong>attrs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>This is a dictionary of attributes that you can pass to identify |
| the table in the HTML. These are not checked for validity before being |
| passed to lxml or Beautiful Soup. However, these attributes must be |
| valid HTML table attributes to work correctly. For example,</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'id'</span><span class="p">:</span> <span class="s1">'table'</span><span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>is a valid attribute dictionary because the ‘id’ HTML tag attribute is |
| a valid HTML attribute for <em>any</em> HTML tag as per <a class="reference external" href="https://html.spec.whatwg.org/multipage/dom.html#global-attributes">this document</a>.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'asdf'</span><span class="p">:</span> <span class="s1">'table'</span><span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>is <em>not</em> a valid attribute dictionary because ‘asdf’ is not a valid |
| HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 |
| table attributes can be found <a class="reference external" href="http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2">here</a>. A |
| working draft of the HTML 5 spec can be found <a class="reference external" href="https://html.spec.whatwg.org/multipage/tables.html">here</a>. It contains the |
| latest information on table attributes for the modern web.</p> |
| </p></li> |
| <li><p><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – See <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> for more details.</p></li> |
| <li><p><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Separator to use to parse thousands. Defaults to <code class="docutils literal notranslate"><span class="pre">','</span></code>.</p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – The encoding used to decode the web page. Defaults to <code class="docutils literal notranslate"><span class="pre">None</span></code>. <code class="docutils literal notranslate"><span class="pre">None</span></code> |
| preserves the previous encoding behavior, which depends on the |
| underlying parser library (e.g., the parser library will try to use |
| the encoding provided by the document).</p></li> |
| <li><p><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g. use ‘,’ for European |
| data).</p></li> |
| <li><p><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can |
| either be integers or column labels, values are functions that take one |
| input argument, the cell (not column) content, and return the |
| transformed content.</p></li> |
| <li><p><strong>na_values</strong> (<em>iterable</em><em>, </em><em>default None</em>) – Custom NA values.</p></li> |
| <li><p><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – If na_values are specified and keep_default_na is False the default NaN |
| values are overridden, otherwise they’re appended to.</p></li> |
| <li><p><strong>displayed_only</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether elements with “display: none” should be parsed.</p></li> |
| <li><p><strong>extract_links</strong> (<em>{None</em><em>, </em><em>"all"</em><em>, </em><em>"header"</em><em>, </em><em>"body"</em><em>, </em><em>"footer"}</em>) – <p>Table elements in the specified section(s) with &lt;a&gt; tags will have their |
| href extracted.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.1.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>A list of DeferredDataFrames.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p>list of DeferredDataFrame</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_csv</span></code></a></dt><dd><p>Read a comma-separated values (csv) file into DeferredDataFrame.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Before using this function you should read the <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-html-gotchas" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><span class="xref std std-ref">gotchas about the |
| HTML parsing libraries</span></a>.</p> |
| <p>Expect to do some cleanup after you call this function. For example, you |
| might need to manually assign column names if the column names are |
| converted to NaN when you pass the <cite>header=0</cite> argument. We try to assume as |
| little as possible about the structure of the table and push the |
| idiosyncrasies of the HTML contained in the table to the user.</p> |
| <p>This function searches for <code class="docutils literal notranslate"><span class="pre">&lt;table&gt;</span></code> elements and, within each table, only for |
| <code class="docutils literal notranslate"><span class="pre">&lt;tr&gt;</span></code> rows and the <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> and <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> cells inside each row. |
| <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> stands for “table data”. This function |
| attempts to properly handle <code class="docutils literal notranslate"><span class="pre">colspan</span></code> and <code class="docutils literal notranslate"><span class="pre">rowspan</span></code> attributes. |
| If the table has a <code class="docutils literal notranslate"><span class="pre">&lt;thead&gt;</span></code> element, it is used to construct |
| the header; otherwise the function attempts to find the header within |
| the body (by putting rows with only <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> elements into the header).</p> |
| <p>Similar to <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> the <cite>header</cite> argument is applied |
| <strong>after</strong> <cite>skiprows</cite> is applied.</p> |
| <p>This function will <em>always</em> return a list of <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> <em>or</em> |
| it will fail, i.e., it will <em>not</em> return an empty list.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <p>See the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html#io-read-html">read_html documentation in the IO section of the pandas docs</a> for some examples of reading in HTML tables.</p> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_html"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_html</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_html"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_html" title="Link to this definition"></a></dt> |
| <dd><p>Render a DataFrame as an HTML table.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>Path</em><em> or </em><em>StringIO-like</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Buffer to write to. If None, the output is returned as a string.</p></li> |
| <li><p><strong>columns</strong> (<em>array-like</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – The subset of columns to write. Writes all columns by default.</p></li> |
| <li><p><strong>col_space</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – The minimum width of each column in CSS length units. An int is assumed to be px units.</p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether to print column labels, default True.</p></li> |
| <li><p><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Whether to print index (row) labels.</p></li> |
| <li><p><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default 'NaN'</em>) – String representation of <code class="docutils literal notranslate"><span class="pre">NaN</span></code> to use.</p></li> |
| <li><p><strong>formatters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>one-param. functions</em><em>, </em><em>optional</em>) – Formatter functions to apply to columns’ elements by position or |
| name. |
| The result of each function must be a unicode string. |
| List/tuple must be of length equal to the number of columns.</p></li> |
| <li><p><strong>float_format</strong> (<em>one-parameter function</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Formatter function to apply to columns’ elements if they are |
| floats. This function must return a unicode string and will be |
| applied only to the non-<code class="docutils literal notranslate"><span class="pre">NaN</span></code> elements, with <code class="docutils literal notranslate"><span class="pre">NaN</span></code> being |
| handled by <code class="docutils literal notranslate"><span class="pre">na_rep</span></code>.</p></li> |
| <li><p><strong>sparsify</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Set to False for a DeferredDataFrame with a hierarchical index to print |
| every multiindex key at each row.</p></li> |
| <li><p><strong>index_names</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Prints the names of the indexes.</p></li> |
| <li><p><strong>justify</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – <p>How to justify the column labels. If None uses the option from |
| the print configuration (controlled by set_option), ‘right’ out |
| of the box. Valid values are</p> |
| <ul> |
| <li><p>left</p></li> |
| <li><p>right</p></li> |
| <li><p>center</p></li> |
| <li><p>justify</p></li> |
| <li><p>justify-all</p></li> |
| <li><p>start</p></li> |
| <li><p>end</p></li> |
| <li><p>inherit</p></li> |
| <li><p>match-parent</p></li> |
| <li><p>initial</p></li> |
| <li><p>unset.</p></li> |
| </ul> |
| </p></li> |
| <li><p><strong>max_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of rows to display in the console.</p></li> |
| <li><p><strong>max_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of columns to display in the console.</p></li> |
| <li><p><strong>show_dimensions</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Display DeferredDataFrame dimensions (number of rows by number of columns).</p></li> |
| <li><p><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator, e.g. ‘,’ in Europe.</p></li> |
| <li><p><strong>bold_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Make the row labels bold in the output.</p></li> |
| <li><p><strong>classes</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em>, </em><em>default None</em>) – CSS class(es) to apply to the resulting html table.</p></li> |
| <li><p><strong>escape</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert the characters &lt;, &gt;, and &amp; to HTML-safe sequences.</p></li> |
| <li><p><strong>notebook</strong> (<em>{True</em><em>, </em><em>False}</em><em>, </em><em>default False</em>) – Whether the generated HTML is for IPython Notebook.</p></li> |
| <li><p><strong>border</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – A <code class="docutils literal notranslate"><span class="pre">border=border</span></code> attribute is included in the opening |
| <cite>&lt;table&gt;</cite> tag. Default <code class="docutils literal notranslate"><span class="pre">pd.options.display.html.border</span></code>.</p></li> |
| <li><p><strong>table_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – A CSS id is included in the opening <cite>&lt;table&gt;</cite> tag if specified.</p></li> |
| <li><p><strong>render_links</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Convert URLs to HTML links.</p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default "utf-8"</em>) – Set character encoding.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>If buf is None, returns the result as a string. Otherwise returns |
| None.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)">str</a> or None</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_string</span></code></dt><dd><p>Convert DeferredDataFrame to a string.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">{</span><span class="s1">'col1'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s1">'col2'</span><span class="p">:</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">3</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">html_string</span> <span class="o">=</span> <span class="s1">'''&lt;table border="1" class="dataframe"&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;thead&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;tr style="text-align: right;"&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;th&gt;&lt;/th&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;th&gt;col1&lt;/th&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;th&gt;col2&lt;/th&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;/tr&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;/thead&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;tbody&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;tr&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;th&gt;0&lt;/th&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;td&gt;1&lt;/td&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;td&gt;4&lt;/td&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;/tr&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;tr&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;th&gt;1&lt;/th&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;td&gt;2&lt;/td&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;td&gt;3&lt;/td&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;/tr&gt;</span> |
| <span class="gp">... </span><span class="s1"> &lt;/tbody&gt;</span> |
| <span class="gp">... </span><span class="s1">&lt;/table&gt;'''</span> |
| <span class="gp">>>> </span><span class="k">assert</span> <span class="n">html_string</span> <span class="o">==</span> <span class="n">df</span><span class="o">.</span><span class="n">to_html</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py class"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.ReadViaPandas"> |
| <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">ReadViaPandas</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">format</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_indexes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">objects_as_strings</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#ReadViaPandas"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.ReadViaPandas" title="Link to this definition"></a></dt> |
| <dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a></p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.ReadViaPandas.expand"> |
| <span class="sig-name descname"><span class="pre">expand</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#ReadViaPandas.expand"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.ReadViaPandas.expand" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="py class"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.WriteViaPandas"> |
| <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">WriteViaPandas</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">format</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#WriteViaPandas"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.WriteViaPandas" title="Link to this definition"></a></dt> |
| <dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a></p> |
| <dl class="py method"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.WriteViaPandas.expand"> |
| <span class="sig-name descname"><span class="pre">expand</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pcoll</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#WriteViaPandas.expand"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#apache_beam.dataframe.io.WriteViaPandas.expand" title="Link to this definition"></a></dt> |
| <dd></dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_excel"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_excel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_excel" title="Link to this definition"></a></dt> |
| <dd><p>Read an Excel file into a <code class="docutils literal notranslate"><span class="pre">pandas</span></code> <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>.</p> |
| <p>Supports <cite>xls</cite>, <cite>xlsx</cite>, <cite>xlsm</cite>, <cite>xlsb</cite>, <cite>odf</cite>, <cite>ods</cite> and <cite>odt</cite> file extensions |
| read from a local filesystem or URL. Supports an option to read |
| a single sheet or a list of sheets.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.13)"><em>bytes</em></a><em>, </em><em>ExcelFile</em><em>, </em><em>xlrd.Book</em><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.xlsx</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.1.0: </span>Passing byte strings is deprecated. To read from a |
| byte string, wrap it in a <code class="docutils literal notranslate"><span class="pre">BytesIO</span></code> object.</p> |
| </div> |
| </p></li> |
| <li><p><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, or </em><em>None</em><em>, </em><em>default 0</em>) – <p>Strings are used for sheet names. Integers are used in zero-indexed |
| sheet positions (chart sheets do not count as a sheet position). |
| Lists of strings/integers are used to request multiple sheets. |
| Specify <code class="docutils literal notranslate"><span class="pre">None</span></code> to get all worksheets.</p> |
| <p>Available cases:</p> |
| <ul> |
| <li><p>Defaults to <code class="docutils literal notranslate"><span class="pre">0</span></code>: 1st sheet as a <cite>DeferredDataFrame</cite></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">1</span></code>: 2nd sheet as a <cite>DeferredDataFrame</cite></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"Sheet1"</span></code>: Load sheet with name “Sheet1”</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">"Sheet5"]</span></code>: Load first, second and sheet named “Sheet5” |
| as a dict of <cite>DeferredDataFrame</cite></p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">None</span></code>: All worksheets.</p></li> |
| </ul> |
| </p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 0</em>) – Row (0-indexed) to use for the column labels of the parsed |
| DeferredDataFrame. If a list of integers is passed those row positions will |
| be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. Use None if there is no header.</p></li> |
| <li><p><strong>names</strong> (<em>array-like</em><em>, </em><em>default None</em>) – List of column names to use. If file contains no header row, |
| then you should explicitly pass header=None.</p></li> |
| <li><p><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default None</em>) – <p>Column (0-indexed) to use as the row labels of the DeferredDataFrame. |
| Pass None if there is no such column. If a list is passed, |
| those columns will be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. If a |
| subset of data is selected with <code class="docutils literal notranslate"><span class="pre">usecols</span></code>, index_col |
| is based on the subset.</p> |
| <p>Missing values will be forward filled to allow roundtripping with |
| <code class="docutils literal notranslate"><span class="pre">to_excel</span></code> for <code class="docutils literal notranslate"><span class="pre">merged_cells=True</span></code>. To avoid forward filling the |
| missing values use <code class="docutils literal notranslate"><span class="pre">set_index</span></code> after reading the data instead of |
| <code class="docutils literal notranslate"><span class="pre">index_col</span></code>.</p> |
| </p></li> |
| <li><p><strong>usecols</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><em>callable</em><em>, </em><em>default None</em>) – <ul> |
| <li><p>If None, then parse all columns.</p></li> |
| <li><p>If str, then indicates comma separated list of Excel column letters |
| and column ranges (e.g. “A:E” or “A,C,E:F”). Ranges are inclusive of |
| both sides.</p></li> |
| <li><p>If list of int, then indicates list of column numbers to be parsed |
| (0-indexed).</p></li> |
| <li><p>If list of string, then indicates list of column names to be parsed.</p></li> |
| <li><p>If callable, then evaluate each column name against it and parse the |
| column if the callable returns <code class="docutils literal notranslate"><span class="pre">True</span></code>.</p></li> |
| </ul> |
| <p>Returns a subset of the columns according to behavior above.</p> |
| </p></li> |
| <li><p><strong>dtype</strong> (<em>Type name</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>column -> type</em><em>, </em><em>default None</em>) – Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} |
| Use <code class="docutils literal notranslate"><span class="pre">object</span></code> to preserve data as stored in Excel and not interpret dtype, |
| which will necessarily result in <code class="docutils literal notranslate"><span class="pre">object</span></code> dtype. |
| If converters are specified, they will be applied INSTEAD |
| of dtype conversion. |
| If you use <code class="docutils literal notranslate"><span class="pre">None</span></code>, it will infer the dtype of each column based on the data.</p></li> |
| <li><p><strong>engine</strong> (<em>{'openpyxl'</em><em>, </em><em>'calamine'</em><em>, </em><em>'odf'</em><em>, </em><em>'pyxlsb'</em><em>, </em><em>'xlrd'}</em><em>, </em><em>default None</em>) – <p>If io is not a buffer or path, this must be set to identify io. |
| Engine compatibility:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">openpyxl</span></code> supports newer Excel file formats.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">calamine</span></code> supports Excel (.xls, .xlsx, .xlsm, .xlsb) |
| and OpenDocument (.ods) file formats.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">odf</span></code> supports OpenDocument file formats (.odf, .ods, .odt).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">pyxlsb</span></code> supports Binary Excel files.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">xlrd</span></code> supports old-style Excel files (.xls).</p></li> |
| </ul> |
| <p>When <code class="docutils literal notranslate"><span class="pre">engine=None</span></code>, the following logic will be used to determine the engine:</p> |
| <ul> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an OpenDocument format (.odf, .ods, .odt), |
| then <a class="reference external" href="https://pypi.org/project/odfpy/">odf</a> will be used.</p></li> |
| <li><p>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an xls format, <code class="docutils literal notranslate"><span class="pre">xlrd</span></code> will be used.</p></li> |
| <li><p>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is in xlsb format, <code class="docutils literal notranslate"><span class="pre">pyxlsb</span></code> will be used.</p></li> |
| <li><p>Otherwise <code class="docutils literal notranslate"><span class="pre">openpyxl</span></code> will be used.</p></li> |
| </ul> |
| </p></li> |
| <li><p><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can |
| either be integers or column labels, values are functions that take one |
| input argument, the Excel cell content, and return the transformed |
| content.</p></li> |
| <li><p><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as True.</p></li> |
| <li><p><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as False.</p></li> |
| <li><p><strong>skiprows</strong> (<em>list-like</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, or </em><em>callable</em><em>, </em><em>optional</em>) – Line numbers to skip (0-indexed) or number of lines to skip (int) at the |
| start of the file. If callable, the callable function will be evaluated |
| against the row indices, returning True if the row should be skipped and |
| False otherwise. An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> |
| <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</p></li> |
| <li><p><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default None</em>) – Number of rows to parse.</p></li> |
| <li><p><strong>na_values</strong> (<em>scalar</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default None</em>) – Additional strings to recognize as NA/NaN. If dict passed, specific |
| per-column NA values. By default the following values are interpreted |
| as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, |
| ‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘None’, |
| ‘n/a’, ‘nan’, ‘null’.</p></li> |
| <li><p><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default NaN values when parsing the data. |
| Depending on whether <code class="docutils literal notranslate"><span class="pre">na_values</span></code> is passed in, the behavior is as follows:</p> |
| <ul> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is True, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are specified, |
| <code class="docutils literal notranslate"><span class="pre">na_values</span></code> is appended to the default NaN values used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is True, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are not specified, only |
| the default NaN values are used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is False, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are specified, only |
| the NaN values specified in <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are used for parsing.</p></li> |
| <li><p>If <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> is False, and <code class="docutils literal notranslate"><span class="pre">na_values</span></code> are not specified, no |
| strings will be parsed as NaN.</p></li> |
| </ul> |
| <p>Note that if <cite>na_filter</cite> is passed in as False, the <code class="docutils literal notranslate"><span class="pre">keep_default_na</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">na_values</span></code> parameters will be ignored.</p> |
| </p></li> |
| <li><p><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of na_values). In |
| data without any NAs, passing <code class="docutils literal notranslate"><span class="pre">na_filter=False</span></code> can improve the |
| performance of reading a large file.</p></li> |
| <li><p><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Indicate number of NA values placed in non-numeric columns.</p></li> |
| <li><p><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">bool</span></code>. If True -> try parsing the index.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">list</span></code> of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 |
| each as a separate date column.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">list</span></code> of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as |
| a single date column.</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">dict</span></code>, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call |
| result ‘foo’</p></li> |
| </ul> |
| <p>If a column or index contains an unparsable date, the entire column or |
| index will be returned unaltered as an object data type. If you don’t want to |
| parse some cells as date just change their type in Excel to “Text”. |
| For non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after <code class="docutils literal notranslate"><span class="pre">pd.read_excel</span></code>.</p> |
| <p>Note: A fast-path exists for iso8601-formatted dates.</p> |
| </p></li> |
| <li><p><strong>date_parser</strong> (<em>function</em><em>, </em><em>optional</em>) – <p>Function to use for converting a sequence of string columns to an array of |
| datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the |
| conversion. Pandas will try to call <cite>date_parser</cite> in three different ways, |
| advancing to the next if an exception occurs: 1) Pass one or more arrays |
| (as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the |
| string values from the columns defined by <cite>parse_dates</cite> into a single array |
| and pass that; and 3) call <cite>date_parser</cite> once for each row using one or |
| more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as |
| arguments.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.0.0: </span>Use <code class="docutils literal notranslate"><span class="pre">date_format</span></code> instead, or read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply |
| <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| </div> |
| </p></li> |
| <li><p><strong>date_format</strong> (str or dict of column -> format, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>If used in conjunction with <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>, will parse dates according to this |
| format. For anything more complex, |
| please read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – Thousands separator for parsing string columns to numeric. Note that |
| this parameter is only necessary for columns stored as TEXT in Excel, |
| any numeric columns will automatically be parsed, regardless of display |
| format.</p></li> |
| <li><p><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default '.'</em>) – <p>Character to recognize as decimal point for parsing string columns to numeric. |
| Note that this parameter is only necessary for columns stored as TEXT in Excel, |
| any numeric columns will automatically be parsed, regardless of display |
| format (e.g. use ‘,’ for European data).</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.4.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default None</em>) – Comments out remainder of line. Pass a character or characters to this |
| argument to indicate comments in the input file. Any data between the |
| comment string and the end of the current line is ignored.</p></li> |
| <li><p><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 0</em>) – Rows at the end to skip (0-indexed).</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p> |
| </p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>engine_kwargs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – Arbitrary keyword arguments passed to excel engine.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p>DeferredDataFrame from the passed in Excel file. See notes in sheet_name |
| argument for more information on when a dict of DeferredDataFrames is returned.</p> |
| </dd> |
| <dt class="field-odd">Return type<span class="colon">:</span></dt> |
| <dd class="field-odd"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)">dict</a> of DeferredDataFrames</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_excel</span></code></dt><dd><p>Write DeferredDataFrame to an Excel file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></dt><dd><p>Write DeferredDataFrame to a comma-separated values (csv) file.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_csv</span></code></a></dt><dd><p>Read a comma-separated values (csv) file into DeferredDataFrame.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_fwf</span></code></a></dt><dd><p>Read a table of fixed-width formatted lines into DeferredDataFrame.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>For specific information on the methods used for each Excel engine, refer to the pandas |
| <a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-excel-reader" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><span class="xref std std-ref">user guide</span></a>.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">The file can be read using the file name as string or an open file object:</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1</span> |
| <span class="go">1 string2 2</span> |
| <span class="go">2 #Comment 3</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="s1">'rb'</span><span class="p">),</span> |
| <span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet3'</span><span class="p">)</span> |
| <span class="go"> Unnamed: 0 Name Value</span> |
| <span class="go">0 0 string1 1</span> |
| <span class="go">1 1 string2 2</span> |
| <span class="go">2 2 #Comment 3</span> |
| |
| <span class="go">Index and header can be specified via the `index_col` and `header` arguments</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> |
| <span class="go"> 0 1 2</span> |
| <span class="go">0 NaN Name Value</span> |
| <span class="go">1 0.0 string1 1</span> |
| <span class="go">2 1.0 string2 2</span> |
| <span class="go">3 2.0 #Comment 3</span> |
| |
| <span class="go">Column types are inferred but can be explicitly specified</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">dtype</span><span class="o">=</span><span class="p">{</span><span class="s1">'Name'</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="s1">'Value'</span><span class="p">:</span> <span class="nb">float</span><span class="p">})</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1.0</span> |
| <span class="go">1 string2 2.0</span> |
| <span class="go">2 #Comment 3.0</span> |
| |
| <span class="go">True, False, and NA values, and thousands separators have defaults,</span> |
| <span class="go">but can be explicitly specified, too. Supply the values you would like</span> |
| <span class="go">as strings or lists of strings!</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">na_values</span><span class="o">=</span><span class="p">[</span><span class="s1">'string1'</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">])</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 NaN 1</span> |
| <span class="go">1 NaN 2</span> |
| <span class="go">2 #Comment 3</span> |
| |
| <span class="go">Comment lines in the excel input file can be skipped using the</span> |
| <span class="go">``comment`` kwarg.</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">'#'</span><span class="p">)</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1.0</span> |
| <span class="go">1 string2 2.0</span> |
| <span class="go">2 None NaN</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_feather"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_feather</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_feather" title="Link to this definition"></a></dt> |
| <dd><p>Load a feather-format object from the file path.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.feather</span></code>.</p></li> |
| <li><p><strong>columns</strong> (<em>sequence</em><em>, </em><em>default None</em>) – If not provided, all columns are read.</p></li> |
| <li><p><strong>use_threads</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether to parallelize reading using multiple threads.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p> |
| </p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Return type<span class="colon">:</span></dt> |
| <dd class="field-even"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#type" title="(in Python v3.13)">type</a> of object stored in file</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_feather</span><span class="p">(</span><span class="s2">"path/to/file.feather"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_parquet"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_parquet</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_parquet" title="Link to this definition"></a></dt> |
| <dd><p>Load a parquet object from the file path, returning a DataFrame.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. |
| The string could be a URL. Valid URL schemes include http, ftp, s3, |
| gs, and file. For file URLs, a host is expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.parquet</span></code>. |
| A file URL can also be a path to a directory that contains multiple |
| partitioned parquet files. Both pyarrow and fastparquet support |
| paths to directories as well as file URLs. A directory path could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/tables</span></code> or <code class="docutils literal notranslate"><span class="pre">s3://bucket/partition_dir</span></code>.</p></li> |
| <li><p><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – <p>Parquet library to use. If ‘auto’, then the option |
| <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> |
| behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if |
| ‘pyarrow’ is unavailable.</p> |
| <p>When using the <code class="docutils literal notranslate"><span class="pre">'pyarrow'</span></code> engine and no storage options are provided |
| and a filesystem is implemented by both <code class="docutils literal notranslate"><span class="pre">pyarrow.fs</span></code> and <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> |
| (e.g. “s3://”), then the <code class="docutils literal notranslate"><span class="pre">pyarrow.fs</span></code> filesystem is attempted first. |
| Use the filesystem keyword with an instantiated fsspec filesystem |
| if you wish to use its implementation.</p> |
| </p></li> |
| <li><p><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>default=None</em>) – If not None, only these columns will be read from the file.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.3.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>use_nullable_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>If True, use dtypes that use <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> as missing value indicator |
| for the resulting DeferredDataFrame. (only applicable for the <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> |
| engine) |
| As new dtypes are added that support <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> in the future, the |
| output with this option will change to use those dtypes. |
| Note: this is an experimental option, and behaviour (e.g. additional |
| supported dtypes) may change without notice.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified deprecated">Deprecated since version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>filesystem</strong> (<em>fsspec</em><em> or </em><em>pyarrow filesystem</em><em>, </em><em>default None</em>) – <p>Filesystem object to use when reading the parquet file. Only implemented |
| for <code class="docutils literal notranslate"><span class="pre">engine="pyarrow"</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.1.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>filters</strong> (<em>List</em><em>[</em><em>Tuple</em><em>] or </em><em>List</em><em>[</em><em>List</em><em>[</em><em>Tuple</em><em>]</em><em>]</em><em>, </em><em>default None</em>) – <p>To filter out data. |
| Filter syntax: [[(column, op, val), …],…] |
| where op is [==, =, &gt;, &gt;=, &lt;, &lt;=, !=, in, not in] |
| The innermost tuples are transposed into a set of filters applied |
| through an <cite>AND</cite> operation. |
| The outer list combines these sets of filters through an <cite>OR</cite> |
| operation. |
| A single list of tuples can also be used, meaning that no <cite>OR</cite> |
| operation between sets of filters is to be conducted.</p> |
| <p>Using this argument will NOT result in row-wise filtering of the final |
| partitions unless <code class="docutils literal notranslate"><span class="pre">engine="pyarrow"</span></code> is also specified. For |
| other engines, filtering is only performed at the partition level, that is, |
| to prevent the loading of some row-groups and/or files.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.1.0.</span></p> |
| </div> |
| </p></li> |
| <li><p><strong>**kwargs</strong> – Any additional kwargs are passed to the engine.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Return type<span class="colon">:</span></dt> |
| <dd class="field-even"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_parquet</span></code></dt><dd><p>Create a parquet object that serializes a DeferredDataFrame.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">original_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span> |
| <span class="gp">... </span> <span class="p">{</span><span class="s2">"foo"</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">5</span><span class="p">),</span> <span class="s2">"bar"</span><span class="p">:</span> <span class="nb">range</span><span class="p">(</span><span class="mi">5</span><span class="p">,</span> <span class="mi">10</span><span class="p">)}</span> |
| <span class="gp">... </span> <span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">original_df</span> |
| <span class="go"> foo bar</span> |
| <span class="go">0 0 5</span> |
| <span class="go">1 1 6</span> |
| <span class="go">2 2 7</span> |
| <span class="go">3 3 8</span> |
| <span class="go">4 4 9</span> |
| <span class="gp">>>> </span><span class="n">df_parquet_bytes</span> <span class="o">=</span> <span class="n">original_df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">()</span> |
| <span class="gp">>>> </span><span class="kn">from</span><span class="w"> </span><span class="nn">io</span><span class="w"> </span><span class="kn">import</span> <span class="n">BytesIO</span> |
| <span class="gp">>>> </span><span class="n">restored_df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">df_parquet_bytes</span><span class="p">))</span> |
| <span class="gp">>>> </span><span class="n">restored_df</span> |
| <span class="go"> foo bar</span> |
| <span class="go">0 0 5</span> |
| <span class="go">1 1 6</span> |
| <span class="go">2 2 7</span> |
| <span class="go">3 3 8</span> |
| <span class="go">4 4 9</span> |
| <span class="gp">>>> </span><span class="n">restored_df</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">original_df</span><span class="p">)</span> |
| <span class="go">True</span> |
| <span class="gp">>>> </span><span class="n">restored_bar</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">df_parquet_bytes</span><span class="p">),</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">"bar"</span><span class="p">])</span> |
| <span class="gp">>>> </span><span class="n">restored_bar</span> |
| <span class="go"> bar</span> |
| <span class="go">0 5</span> |
| <span class="go">1 6</span> |
| <span class="go">2 7</span> |
| <span class="go">3 8</span> |
| <span class="go">4 9</span> |
| <span class="gp">>>> </span><span class="n">restored_bar</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">original_df</span><span class="p">[[</span><span class="s1">'bar'</span><span class="p">]])</span> |
| <span class="go">True</span> |
| |
| <span class="go">The function uses `kwargs` that are passed directly to the engine.</span> |
| <span class="go">In the following example, we use the `filters` argument of the pyarrow</span> |
| <span class="go">engine to filter the rows of the DataFrame.</span> |
| |
| <span class="go">Since `pyarrow` is the default engine, we can omit the `engine` argument.</span> |
| <span class="go">Note that the `filters` argument is implemented by the `pyarrow` engine,</span> |
| <span class="go">which can benefit from multithreading and also potentially be more</span> |
| <span class="go">economical in terms of memory.</span> |
| |
| <span class="gp">>>> </span><span class="n">sel</span> <span class="o">=</span> <span class="p">[(</span><span class="s2">"foo"</span><span class="p">,</span> <span class="s2">">"</span><span class="p">,</span> <span class="mi">2</span><span class="p">)]</span> |
| <span class="gp">>>> </span><span class="n">restored_part</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="n">BytesIO</span><span class="p">(</span><span class="n">df_parquet_bytes</span><span class="p">),</span> <span class="n">filters</span><span class="o">=</span><span class="n">sel</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">restored_part</span> |
| <span class="go"> foo bar</span> |
| <span class="go">0 3 8</span> |
| <span class="go">1 4 9</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_sas"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_sas</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_sas" title="Link to this definition"></a></dt> |
| <dd><p>Read SAS files stored as either XPORT or SAS7BDAT format files.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.sas7bdat</span></code>.</p></li> |
| <li><p><strong>format</strong> (<em>str {'xport'</em><em>, </em><em>'sas7bdat'}</em><em> or </em><em>None</em>) – If None, file format is inferred from file extension. If ‘xport’ or |
| ‘sas7bdat’, uses the corresponding format.</p></li> |
| <li><p><strong>index</strong> (<em>identifier</em><em> of </em><em>index column</em><em>, </em><em>defaults to None</em>) – Identifier of column that should be used as index of the DeferredDataFrame.</p></li> |
| <li><p><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default is None</em>) – Encoding for text data. If None, text data are stored as raw bytes.</p></li> |
| <li><p><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Read file <cite>chunksize</cite> lines at a time, returns iterator.</p></li> |
| <li><p><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>defaults to False</em>) – If True, returns an iterator for reading the file incrementally.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Returns<span class="colon">:</span></dt> |
| <dd class="field-even"><p><ul class="simple"> |
| <li><p><em>DeferredDataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader</em></p></li> |
| </ul> |
| </p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_sas</span><span class="p">(</span><span class="s2">"sas_data.sas7bdat"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_spss"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_spss</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_spss" title="Link to this definition"></a></dt> |
| <dd><p>Load an SPSS file from the file path, returning a DataFrame.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><em>Path</em>) – File path.</p></li> |
| <li><p><strong>usecols</strong> (<em>list-like</em><em>, </em><em>optional</em>) – Return a subset of the columns. If None, return all columns.</p></li> |
| <li><p><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default is True</em>) – Convert categorical columns into pd.Categorical.</p></li> |
| <li><p><strong>dtype_backend</strong> (<em>{'numpy_nullable'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>default 'numpy_nullable'</em>) – <p>Back-end data type applied to the resultant <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (still experimental). Behaviour is as follows:</p> |
| <ul> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"numpy_nullable"</span></code>: returns nullable-dtype-backed <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> |
| (default).</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code>: returns pyarrow-backed nullable <code class="xref py py-class docutils literal notranslate"><span class="pre">ArrowDtype</span></code> |
| DeferredDataFrame.</p></li> |
| </ul> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 2.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Return type<span class="colon">:</span></dt> |
| <dd class="field-even"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_spss</span><span class="p">(</span><span class="s2">"spss_data.sav"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.read_stata"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">read_stata</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_stata" title="Link to this definition"></a></dt> |
| <dd><p>Read Stata file into DataFrame.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.dta</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </p></li> |
| <li><p><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert date variables to DeferredDataFrame time values.</p></li> |
| <li><p><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Read value labels and convert columns to Categorical/Factor variables.</p></li> |
| <li><p><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Column to set as index.</p></li> |
| <li><p><strong>convert_missing</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Flag indicating whether to convert missing values to their Stata |
| representations. If False, missing values are replaced with nan. |
| If True, columns containing missing values are returned with |
| object data types and missing values are represented by |
| StataMissingValue objects.</p></li> |
| <li><p><strong>preserve_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Preserve Stata datatypes. If False, numeric data are upcast to pandas |
| default types for foreign data (float64 or int64).</p></li> |
| <li><p><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> or </em><em>None</em>) – Columns to retain. Columns will be returned in the given order. None |
| returns all columns.</p></li> |
| <li><p><strong>order_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Flag indicating whether converted categorical data are ordered.</p></li> |
| <li><p><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default None</em>) – Return StataReader object for iterations, returns chunks with |
| given number of lines.</p></li> |
| <li><p><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default False</em>) – Return StataReader object.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| </p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide on reading and writing remote files</a>.</p> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Return type<span class="colon">:</span></dt> |
| <dd class="field-even"><p><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or pandas.api.typing.StataReader</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">io.stata.StataReader</span></code></dt><dd><p>Low-level reader for Stata data files.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_stata</span></code></dt><dd><p>Export Stata data files.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Categorical variables read through an iterator may not have the same |
| categories and dtype. This occurs when a variable stored in a DTA |
| file is associated with an incomplete set of value labels that only |
| label a strict subset of the values.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Creating a dummy Stata file for this example</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'animal'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">,</span> <span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'speed'</span><span class="p">:</span> <span class="p">[</span><span class="mi">350</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">361</span><span class="p">,</span> <span class="mi">15</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| |
| <span class="go">Read a Stata dta file:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| |
| <span class="go">Read a Stata dta file in 10,000 line chunks:</span> |
| |
| <span class="gp">>>> </span><span class="n">values</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="mi">20_000</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s2">"uint8"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">"i"</span><span class="p">])</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'filename.dta'</span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">'filename.dta'</span><span class="p">,</span> <span class="n">chunksize</span><span class="o">=</span><span class="mi">10000</span><span class="p">)</span> <span class="k">as</span> <span class="n">itr</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="k">for</span> <span class="n">chunk</span> <span class="ow">in</span> <span class="n">itr</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="c1"># Operate on a single chunk, e.g., chunk.mean()</span> |
| <span class="gp">... </span> <span class="k">pass</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_excel"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_excel</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_excel" title="Link to this definition"></a></dt> |
| <dd><p>Write object to an Excel sheet.</p> |
| <p>To write a single object to an Excel .xlsx file it is only necessary to |
| specify a target file name. To write to multiple sheets it is necessary to |
| create an <cite>ExcelWriter</cite> object with a target file name, and specify a sheet |
| in the file to write to.</p> |
| <p>Multiple sheets may be written to by specifying unique <cite>sheet_name</cite>. |
| With all data written to the file it is necessary to save the changes. |
| Note that creating an <cite>ExcelWriter</cite> object with a file name that already |
| exists will result in the contents of the existing file being erased.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>excel_writer</strong> (<em>path-like</em><em>, </em><em>file-like</em><em>, or </em><em>ExcelWriter object</em>) – File path or existing ExcelWriter.</p></li> |
| <li><p><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default 'Sheet1'</em>) – Name of sheet which will contain DeferredDataFrame.</p></li> |
| <li><p><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</p></li> |
| <li><p><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Format string for floating point numbers. For example |
| <code class="docutils literal notranslate"><span class="pre">float_format="%.2f"</span></code> will format 0.1234 to 0.12.</p></li> |
| <li><p><strong>columns</strong> (<em>sequence</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Columns to write.</p></li> |
| <li><p><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default True</em>) – Write out the column names. If a list of strings is given, it is |
| assumed to be aliases for the column names.</p></li> |
| <li><p><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</p></li> |
| <li><p><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><em>sequence</em><em>, </em><em>optional</em>) – Column label for index column(s) if desired. If not specified, and |
| <cite>header</cite> and <cite>index</cite> are True, then the index names are used. A |
| sequence should be given if the DeferredDataFrame uses MultiIndex.</p></li> |
| <li><p><strong>startrow</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell row to dump data frame.</p></li> |
| <li><p><strong>startcol</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell column to dump data frame.</p></li> |
| <li><p><strong>engine</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – Write engine to use, ‘openpyxl’ or ‘xlsxwriter’. You can also set this |
| via the options <code class="docutils literal notranslate"><span class="pre">io.excel.xlsx.writer</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">io.excel.xlsm.writer</span></code>.</p></li> |
| <li><p><strong>merge_cells</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default True</em>) – Write MultiIndex and Hierarchical Rows as merged cells.</p></li> |
| <li><p><strong>inf_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>default 'inf'</em>) – Representation for infinity (there is no native representation for |
| infinity in Excel).</p></li> |
| <li><p><strong>freeze_panes</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.13)"><em>tuple</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em> (</em><em>length 2</em><em>)</em><em>, </em><em>optional</em>) – Specifies the one-based bottommost row and rightmost column that |
| is to be frozen.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><p><strong>engine_kwargs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – Arbitrary keyword arguments passed to excel engine.</p></li> |
| </ul> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.to_csv" title="apache_beam.dataframe.io.to_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">to_csv</span></code></a></dt><dd><p>Write DeferredDataFrame to a comma-separated values (csv) file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">ExcelWriter</span></code></dt><dd><p>Class for writing DeferredDataFrame objects into excel sheets.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_excel" title="apache_beam.dataframe.io.read_excel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_excel</span></code></a></dt><dd><p>Read an Excel file into a DeferredDataFrame.</p> |
| </dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_csv</span></code></a></dt><dd><p>Read a comma-separated values (csv) file into DeferredDataFrame.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">io.formats.style.Styler.to_excel</span></code></dt><dd><p>Add styles to Excel sheet.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>For compatibility with <code class="xref py py-meth docutils literal notranslate"><span class="pre">to_csv()</span></code>, |
| to_excel serializes lists and dicts to strings before writing.</p> |
| <p>Once a workbook has been saved it is not possible to write further |
| data without rewriting the whole workbook.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Create, write to and save a workbook:</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">],</span> <span class="p">[</span><span class="s1">'c'</span><span class="p">,</span> <span class="s1">'d'</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">'row 1'</span><span class="p">,</span> <span class="s1">'row 2'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'col 1'</span><span class="p">,</span> <span class="s1">'col 2'</span><span class="p">])</span> |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">"output.xlsx"</span><span class="p">)</span> |
| |
| <span class="go">To specify the sheet name:</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">"output.xlsx"</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_1'</span><span class="p">)</span> |
| |
| <span class="go">If you wish to write to more than one sheet in the workbook, it is</span> |
| <span class="go">necessary to specify an ExcelWriter object:</span> |
| |
| <span class="gp">>>> </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">'output.xlsx'</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_1'</span><span class="p">)</span> |
| <span class="gp">... </span> <span class="n">df2</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_2'</span><span class="p">)</span> |
| |
| <span class="go">ExcelWriter can also be used to append to an existing Excel file:</span> |
| |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">'output.xlsx'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">mode</span><span class="o">=</span><span class="s1">'a'</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_3'</span><span class="p">)</span> |
| |
| <span class="go">To set the library that is used to write the Excel file,</span> |
| <span class="go">you can pass the `engine` keyword (the default engine is</span> |
| <span class="go">automatically chosen depending on the file extension):</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s1">'output1.xlsx'</span><span class="p">,</span> <span class="n">engine</span><span class="o">=</span><span class="s1">'xlsxwriter'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_feather"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_feather</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_feather" title="Link to this definition"></a></dt> |
| <dd><p>Write a DataFrame to the binary Feather format.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function. If a string or a path, |
| it will be used as Root Directory path when writing a partitioned dataset.</p></li> |
| <li><p><strong>**kwargs</strong> – Additional keywords passed to <code class="xref py py-func docutils literal notranslate"><span class="pre">pyarrow.feather.write_feather()</span></code>. |
| This includes the <cite>compression</cite>, <cite>compression_level</cite>, <cite>chunksize</cite> |
| and <cite>version</cite> keywords.</p></li> |
| </ul> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <p class="rubric">Notes</p> |
| <p>This function writes the dataframe as a <a class="reference external" href="https://arrow.apache.org/docs/python/feather.html">feather file</a>. Requires a default |
| index. To save the DeferredDataFrame with a custom index, use a method that |
| supports custom indices, e.g. <cite>to_parquet</cite>.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">]])</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_feather</span><span class="p">(</span><span class="s2">"file.feather"</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_parquet"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_parquet</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_parquet" title="Link to this definition"></a></dt> |
| <dd><p>Write a DataFrame to the binary parquet format.</p> |
| <p>This function writes the dataframe as a <a class="reference external" href="https://parquet.apache.org/">parquet file</a>. You can choose different parquet |
| backends, and have the option of compression. See |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><span class="xref std std-ref">the user guide</span></a> for more details.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><em>None</em><em>, </em><em>default None</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function. If None, the result is |
| returned as bytes. If a string or path, it will be used as Root Directory |
| path when writing a partitioned dataset.</p></li> |
| <li><p><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – Parquet library to use. If ‘auto’, then the option |
| <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> |
| behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if |
| ‘pyarrow’ is unavailable.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><em>None</em><em>, </em><em>default 'snappy'</em>) – Name of the compression to use. Use <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Supported options: ‘snappy’, ‘gzip’, ‘brotli’, ‘lz4’, ‘zstd’.</p></li> |
| <li><p><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>default None</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code>, include the dataframe’s index(es) in the file output. |
| If <code class="docutils literal notranslate"><span class="pre">False</span></code>, they will not be written to the file. |
| If <code class="docutils literal notranslate"><span class="pre">None</span></code>, similar to <code class="docutils literal notranslate"><span class="pre">True</span></code> the dataframe’s index(es) |
| will be saved. However, instead of being saved as values, |
| the RangeIndex will be stored as a range in the metadata so it |
| doesn’t require much space and is faster. Other indexes will |
| be included as columns in the file output.</p></li> |
| <li><p><strong>partition_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Column names by which to partition the dataset. |
| Columns are partitioned in the order they are given. |
| Must be None if path is not a string.</p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| </li> |
| <li><p><strong>**kwargs</strong> – Additional arguments passed to the parquet library. See |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v3.0.0.dev0+2296.gfcd2a5d50f)"><span class="xref std std-ref">pandas io</span></a> for more details.</p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Return type<span class="colon">:</span></dt> |
| <dd class="field-even"><p>bytes if no path argument is provided else None</p> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_parquet" title="apache_beam.dataframe.io.read_parquet"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_parquet</span></code></a></dt><dd><p>Read a parquet file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_orc</span></code></dt><dd><p>Write an orc file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></dt><dd><p>Write a csv file.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_sql</span></code></dt><dd><p>Write to a sql table.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">DeferredDataFrame.to_hdf</span></code></dt><dd><p>Write to hdf.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>This function requires either the <a class="reference external" href="https://pypi.org/project/fastparquet">fastparquet</a> or <a class="reference external" href="https://arrow.apache.org/docs/python/">pyarrow</a> library.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">{</span><span class="s1">'col1'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s1">'col2'</span><span class="p">:</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="s1">'df.parquet.gzip'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="s1">'gzip'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s1">'df.parquet.gzip'</span><span class="p">)</span> |
| <span class="go"> col1 col2</span> |
| <span class="go">0 1 3</span> |
| <span class="go">1 2 4</span> |
| |
| <span class="go">If you want to get a buffer to the parquet content you can use an io.BytesIO</span> |
| <span class="go">object, as long as you don't use partition_cols, which creates multiple files.</span> |
| |
| <span class="gp">>>> </span><span class="kn">import</span><span class="w"> </span><span class="nn">io</span> |
| <span class="gp">>>> </span><span class="n">f</span> <span class="o">=</span> <span class="n">io</span><span class="o">.</span><span class="n">BytesIO</span><span class="p">()</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| <span class="go">0</span> |
| <span class="gp">>>> </span><span class="n">content</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="py function"> |
| <dt class="sig sig-object py" id="apache_beam.dataframe.io.to_stata"> |
| <span class="sig-prename descclassname"><span class="pre">apache_beam.dataframe.io.</span></span><span class="sig-name descname"><span class="pre">to_stata</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">df</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_stata" title="Link to this definition"></a></dt> |
| <dd><p>Export DataFrame object to Stata dta format.</p> |
| <p>Writes the DataFrame to a Stata dataset file. |
| “dta” files contain a Stata dataset.</p> |
| <dl class="field-list simple"> |
| <dt class="field-odd">Parameters<span class="colon">:</span></dt> |
| <dd class="field-odd"><ul class="simple"> |
| <li><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>buffer</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function.</p></li> |
| <li><p><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a>) – Dictionary mapping columns containing datetime types to stata |
| internal format to use when writing the dates. Options are ‘tc’, |
| ‘td’, ‘tm’, ‘tw’, ‘th’, ‘tq’, ‘ty’. Column can be either an integer |
| or a name. Datetime columns that do not have a conversion type |
| specified will be converted to ‘tc’. Raises NotImplementedError if |
| a datetime column has timezone information.</p></li> |
| <li><p><strong>write_index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a>) – Write the index to Stata dataset.</p></li> |
| <li><p><strong>byteorder</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – Can be “&gt;”, “&lt;”, “little”, or “big”. Default is <cite>sys.byteorder</cite>.</p></li> |
| <li><p><strong>time_stamp</strong> (<em>datetime</em>) – A datetime to use as file creation date. Default is the current |
| time.</p></li> |
| <li><p><strong>data_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – A label for the data set. Must be 80 characters or smaller.</p></li> |
| <li><p><strong>variable_labels</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a>) – Dictionary containing columns as keys and variable labels as |
| values. Each label must be 80 characters or smaller.</p></li> |
| <li><p><strong>version</strong> (<em>{114</em><em>, </em><em>117</em><em>, </em><em>118</em><em>, </em><em>119</em><em>, </em><em>None}</em><em>, </em><em>default 114</em>) – <p>Version to use in the output dta file. Set to None to let pandas |
| decide between 118 or 119 formats depending on the number of |
| columns in the frame. pandas Version 114 can be read by Stata 10 and |
| later. pandas Version 117 can be read by Stata 13 or later. pandas Version 118 |
| is supported in Stata 14 and later. pandas Version 119 is supported in |
| Stata 15 and later. pandas Version 114 limits string variables to 244 |
| characters or fewer while versions 117 and later allow strings |
| with lengths up to 2,000,000 characters. Versions 118 and 119 |
| support Unicode characters, and pandas version 119 supports more than |
| 32,767 variables.</p> |
| <p>pandas Version 119 should usually only be used when the number of |
| variables exceeds the capacity of dta format 118. Exporting |
| smaller datasets in format 119 may have unintended consequences, |
| and, as of November 2020, Stata SE cannot read pandas version 119 files.</p> |
| </p></li> |
| <li><p><strong>convert_strl</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>, </em><em>optional</em>) – List of names of string columns to convert to Stata StrL |
| format. Only available if version is 117. Storing strings in the |
| StrL format can produce smaller dta files if strings have more than |
| 8 characters and values are repeated.</p></li> |
| <li><p><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'xz'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and |
| other key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code>, <code class="docutils literal notranslate"><span class="pre">lzma.LZMAFile</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified changed">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </p></li> |
| <li><p><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples of storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide section on reading and writing remote files</a>.</p> |
| </p></li> |
| <li><p><strong>value_labels</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.13)"><em>dict</em></a><em> of </em><em>dicts</em>) – <p>Dictionary containing columns as keys and dictionaries of column value |
| to labels as values. Labels for a single variable must be 32,000 |
| characters or smaller.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified added">Added in version 1.4.0.</span></p> |
| </div> |
| </p></li> |
| </ul> |
| </dd> |
| <dt class="field-even">Raises<span class="colon">:</span></dt> |
| <dd class="field-even"><ul class="simple"> |
| <li><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#NotImplementedError" title="(in Python v3.13)"><strong>NotImplementedError</strong></a> – <ul> |
| <li><p>If datetimes contain timezone information</p></li> |
| <li><p>Column dtype is not representable in Stata</p></li> |
| </ul> |
| </p></li> |
| <li><p><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#ValueError" title="(in Python v3.13)"><strong>ValueError</strong></a> – <ul> |
| <li><p>Columns listed in convert_dates are neither datetime64[ns] |
| nor datetime.datetime</p></li> |
| <li><p>Column listed in convert_dates is not in DeferredDataFrame</p></li> |
| <li><p>Categorical label contains more than 32,000 characters</p></li> |
| </ul> |
| </p></li> |
| </ul> |
| </dd> |
| </dl> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="admonition-title">See also</p> |
| <dl class="simple"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_stata" title="apache_beam.dataframe.io.read_stata"><code class="xref py py-obj docutils literal notranslate"><span class="pre">read_stata</span></code></a></dt><dd><p>Import Stata data files.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">io.stata.StataWriter</span></code></dt><dd><p>Low-level writer for Stata data files.</p> |
| </dd> |
| <dt><code class="xref py py-obj docutils literal notranslate"><span class="pre">io.stata.StataWriter117</span></code></dt><dd><p>Low-level writer for pandas version 117 files.</p> |
| </dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'animal'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">,</span> <span class="s1">'falcon'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="s1">'parrot'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'speed'</span><span class="p">:</span> <span class="p">[</span><span class="mi">350</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">361</span><span class="p">,</span> <span class="mi">15</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| </section> |
| |
| |
| </div> |
| </div> |
| <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer"> |
| <a href="apache_beam.dataframe.frames.html" class="btn btn-neutral float-left" title="apache_beam.dataframe.frames module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a> |
| <a href="apache_beam.dataframe.pandas_top_level_functions.html" class="btn btn-neutral float-right" title="apache_beam.dataframe.pandas_top_level_functions module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a> |
| </div> |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p>© Copyright Apache Beam.</p> |
| </div> |
| |
| Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a |
| <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> |
| provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| |
| </footer> |
| </div> |
| </div> |
| </section> |
| </div> |
| <script> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| </body> |
| </html> |