| |
| |
| <!DOCTYPE html> |
| <!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>apache_beam.dataframe.io module — Apache Beam 2.56.0 documentation</title> |
| |
| |
| |
| |
| |
| |
| |
| |
| <script type="text/javascript" src="_static/js/modernizr.min.js"></script> |
| |
| |
| <script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script> |
| <script type="text/javascript" src="_static/jquery.js"></script> |
| <script type="text/javascript" src="_static/underscore.js"></script> |
| <script type="text/javascript" src="_static/doctools.js"></script> |
| <script type="text/javascript" src="_static/language_data.js"></script> |
| <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| |
| <script type="text/javascript" src="_static/js/theme.js"></script> |
| |
| |
| |
| |
| <link rel="stylesheet" href="_static/css/theme.css" type="text/css" /> |
| <link rel="stylesheet" href="_static/pygments.css" type="text/css" /> |
| <link rel="index" title="Index" href="genindex.html" /> |
| <link rel="search" title="Search" href="search.html" /> |
| <link rel="next" title="apache_beam.dataframe.pandas_top_level_functions module" href="apache_beam.dataframe.pandas_top_level_functions.html" /> |
| <link rel="prev" title="apache_beam.dataframe.frames module" href="apache_beam.dataframe.frames.html" /> |
| </head> |
| |
| <body class="wy-body-for-nav"> |
| |
| |
| <div class="wy-grid-for-nav"> |
| |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search" > |
| |
| |
| |
| <a href="index.html" class="icon icon-home"> Apache Beam |
| |
| |
| |
| </a> |
| |
| |
| |
| |
| <div class="version"> |
| 2.56.0 |
| </div> |
| |
| |
| |
| |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="search.html" method="get"> |
| <input type="text" name="q" placeholder="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| |
| |
| </div> |
| |
| <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> |
| |
| |
| |
| |
| |
| |
| <ul class="current"> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li> |
| <li class="toctree-l1 current"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a><ul class="current"> |
| <li class="toctree-l2 current"><a class="reference internal" href="apache_beam.dataframe.html#submodules">Submodules</a><ul class="current"> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.convert.html">apache_beam.dataframe.convert module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.doctests.html">apache_beam.dataframe.doctests module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.expressions.html">apache_beam.dataframe.expressions module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frame_base.html">apache_beam.dataframe.frame_base module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.frames.html">apache_beam.dataframe.frames module</a></li> |
| <li class="toctree-l3 current"><a class="current reference internal" href="#">apache_beam.dataframe.io module</a><ul> |
| <li class="toctree-l4"><a class="reference internal" href="#sources">Sources</a></li> |
| <li class="toctree-l4"><a class="reference internal" href="#sinks">Sinks</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.pandas_top_level_functions.html">apache_beam.dataframe.pandas_top_level_functions module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.partitionings.html">apache_beam.dataframe.partitionings module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.schemas.html">apache_beam.dataframe.schemas module</a></li> |
| <li class="toctree-l3"><a class="reference internal" href="apache_beam.dataframe.transforms.html">apache_beam.dataframe.transforms module</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.testing.html">apache_beam.testing package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.yaml.html">apache_beam.yaml package</a></li> |
| </ul> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li> |
| </ul> |
| |
| |
| |
| </div> |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> |
| |
| |
| <nav class="wy-nav-top" aria-label="top navigation"> |
| |
| <i data-toggle="wy-nav-top" class="fa fa-bars"></i> |
| <a href="index.html">Apache Beam</a> |
| |
| </nav> |
| |
| |
| <div class="wy-nav-content"> |
| |
| <div class="rst-content"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div role="navigation" aria-label="breadcrumbs navigation"> |
| |
| <ul class="wy-breadcrumbs"> |
| |
| <li><a href="index.html">Docs</a> »</li> |
| |
| <li><a href="apache_beam.dataframe.html">apache_beam.dataframe package</a> »</li> |
| |
| <li>apache_beam.dataframe.io module</li> |
| |
| |
| <li class="wy-breadcrumbs-aside"> |
| |
| |
| <a href="_sources/apache_beam.dataframe.io.rst.txt" rel="nofollow"> View page source</a> |
| |
| |
| </li> |
| |
| </ul> |
| |
| |
| <hr/> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <div class="section" id="module-apache_beam.dataframe.io"> |
| <span id="apache-beam-dataframe-io-module"></span><h1>apache_beam.dataframe.io module<a class="headerlink" href="#module-apache_beam.dataframe.io" title="Permalink to this headline">¶</a></h1> |
| <p>Sources and sinks for the Beam DataFrame API.</p> |
| <div class="section" id="sources"> |
| <h2>Sources<a class="headerlink" href="#sources" title="Permalink to this headline">¶</a></h2> |
| <p>This module provides analogs for pandas <code class="docutils literal notranslate"><span class="pre">read</span></code> methods, like |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.read_csv.html#pandas.read_csv" title="(in pandas v3.0.0.dev0+802.g7c836ed2ec)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pandas.read_csv()</span></code></a>. However Beam sources like <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> |
| create a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>, and return a |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a> or |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredSeries</span></code></a> representing the contents |
| of the referenced file(s) or data source.</p> |
| <p>The result of these methods must be applied to a <code class="xref py py-class docutils literal notranslate"><span class="pre">Pipeline</span></code> |
| object, for example:</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">df</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="n">beam</span><span class="o">.</span><span class="n">dataframe</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="o">...</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="sinks"> |
| <h2>Sinks<a class="headerlink" href="#sinks" title="Permalink to this headline">¶</a></h2> |
| <p>This module also defines analogs for pandas sink, or <code class="docutils literal notranslate"><span class="pre">to</span></code>, methods that |
| generate a Beam <code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code>. Users should prefer calling |
| these operations from <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a> |
| instances (for example with |
| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame.to_csv" title="apache_beam.dataframe.frames.DeferredDataFrame.to_csv"><code class="xref py py-meth docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv</span></code></a>).</p> |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_gbq"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_gbq</code><span class="sig-paren">(</span><em>table</em>, <em>dataset=None</em>, <em>project_id=None</em>, <em>use_bqstorage_api=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_gbq"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_gbq" title="Permalink to this definition">¶</a></dt> |
| <dd><p>This function reads data from a BigQuery table and produces a |
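| <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code></a>.</p> |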
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple"> |
| <li><strong>table</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a>) – The table to read. It may be given in the fully qualified |
| form ‘PROJECT:dataset.table’, in which case the dataset and project_id |
| parameters below can be omitted.</li> |
| <li><strong>dataset</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a>) – Please specify the dataset |
| (can omit if table was specified as ‘PROJECT:dataset.table’).</li> |
| <li><strong>project_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a>) – Please specify the project ID |
| (can omit if table was specified as ‘PROJECT:dataset.table’).</li> |
| <li><strong>use_bqstorage_api</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a>) – If you would like to utilize |
| the BigQuery Storage API in ReadFromBigQuery, please set |
| this flag to true. Otherwise, please set this flag |
| to false or leave it unspecified.</li> |
| </ul> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_csv"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_csv</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>splittable=False</em>, <em>binary=True</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_csv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_csv" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read a comma-separated values (csv) file into DataFrame.</p> |
| <p>Also supports optionally iterating over the file or breaking it |
| into chunks.</p> |
| <p>Additional help can be found in the online docs for |
| <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">IO Tools</a>.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is |
| expected. A local file could be: <a class="reference external" href="file://localhost/path/to/table.csv">file://localhost/path/to/table.csv</a>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, such as |
| a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </li> |
| <li><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – Delimiter to use. If sep is None, the C engine cannot automatically detect |
| the separator, but the Python parsing engine can, meaning the latter will |
| be used and automatically detect the separator by Python’s builtin sniffer |
| tool, <code class="docutils literal notranslate"><span class="pre">csv.Sniffer</span></code>. In addition, separators longer than 1 character and |
| different from <code class="docutils literal notranslate"><span class="pre">'\s+'</span></code> will be interpreted as regular expressions and |
| will also force the use of the Python parsing engine. Note that regex |
| delimiters are prone to ignoring quoted data. Regex example: <code class="docutils literal notranslate"><span class="pre">'\r\t'</span></code>.</li> |
| <li><strong>delimiter</strong> (str, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – Alias for sep.</li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a><em>, </em><em>default 'infer'</em>) – Row number(s) to use as the column names, and the start of the |
| data. Default behavior is to infer the column names: if no names |
| are passed the behavior is identical to <code class="docutils literal notranslate"><span class="pre">header=0</span></code> and column |
| names are inferred from the first line of the file, if column |
| names are passed explicitly then the behavior is identical to |
| <code class="docutils literal notranslate"><span class="pre">header=None</span></code>. Explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to be able to |
| replace existing names. The header can be a list of integers that |
| specify row locations for a multi-index on the columns |
| e.g. [0,1,3]. Intervening rows that are not specified will be |
| skipped (e.g. 2 in this example is skipped). Note that this |
| parameter ignores commented lines and empty lines if |
| <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>, so <code class="docutils literal notranslate"><span class="pre">header=0</span></code> denotes the first line of |
| data rather than the first line of the file.</li> |
| <li><strong>names</strong> (<em>array-like</em><em>, </em><em>optional</em>) – List of column names to use. If the file contains a header row, |
| then you should explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to override the column names. |
| Duplicates in this list are not allowed.</li> |
| <li><strong>index_col</strong> (int, str, sequence of int / str, or False, optional, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>Column(s) to use as the row labels of the <code class="docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code>, either given as |
| string name or column index. If a sequence of int / str is given, a |
| MultiIndex is used.</p> |
| <p>Note: <code class="docutils literal notranslate"><span class="pre">index_col=False</span></code> can be used to force pandas to <em>not</em> use the first |
| column as the index, e.g. when you have a malformed file with delimiters at |
| the end of each line.</p> |
| </li> |
| <li><strong>usecols</strong> (<em>list-like</em><em> or </em><em>callable</em><em>, </em><em>optional</em>) – <p>Return a subset of the columns. If list-like, all elements must either |
| be positional (i.e. integer indices into the document columns) or strings |
| that correspond to column names provided either by the user in <cite>names</cite> or |
| inferred from the document header row(s). If <code class="docutils literal notranslate"><span class="pre">names</span></code> are given, the document |
| header row(s) are not taken into account. For example, a valid list-like |
| <cite>usecols</cite> parameter would be <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">2]</span></code> or <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar',</span> <span class="pre">'baz']</span></code>. |
| Element order is ignored, so <code class="docutils literal notranslate"><span class="pre">usecols=[0,</span> <span class="pre">1]</span></code> is the same as <code class="docutils literal notranslate"><span class="pre">[1,</span> <span class="pre">0]</span></code>. |
| To instantiate a DeferredDataFrame from <code class="docutils literal notranslate"><span class="pre">data</span></code> with element order preserved use |
| <code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['foo',</span> <span class="pre">'bar']]</span></code> for columns |
| in <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar']</span></code> order or |
| <code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['bar',</span> <span class="pre">'foo']]</span></code> |
| for <code class="docutils literal notranslate"><span class="pre">['bar',</span> <span class="pre">'foo']</span></code> order.</p> |
| <p>If callable, the callable function will be evaluated against the column |
| names, returning names where the callable function evaluates to True. An |
| example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x.upper()</span> <span class="pre">in</span> |
| <span class="pre">['AAA',</span> <span class="pre">'BBB',</span> <span class="pre">'DDD']</span></code>. Using this parameter results in much faster |
| parsing time and lower memory usage.</p> |
| </li> |
| <li><strong>dtype</strong> (<em>Type name</em><em> or </em><em>dict of column -> type</em><em>, </em><em>optional</em>) – <p>Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32, |
| ‘c’: ‘Int64’}. |
| Use <cite>str</cite> or <cite>object</cite> together with suitable <cite>na_values</cite> settings |
| to preserve and not interpret dtype. |
| If converters are specified, they will be applied INSTEAD |
| of dtype conversion.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Support for defaultdict was added. Specify a defaultdict as input where |
| the default determines the dtype of the columns which are not explicitly |
| listed.</p> |
| </div> |
| </li> |
| <li><strong>engine</strong> (<em>{'c'</em><em>, </em><em>'python'</em><em>, </em><em>'pyarrow'}</em><em>, </em><em>optional</em>) – <p>Parser engine to use. The C and pyarrow engines are faster, while the python engine |
| is currently more feature-complete. Multithreading is currently only supported by |
| the pyarrow engine.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.4.0: </span>The “pyarrow” engine was added as an <em>experimental</em> engine, and some features |
| are unsupported, or may not work correctly, with this engine.</p> |
| </div> |
| </li> |
| <li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – Dict of functions for converting values in certain columns. Keys can either |
| be integers or column labels.</li> |
| <li><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as True in addition to case-insensitive variants of “True”.</li> |
| <li><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>optional</em>) – Values to consider as False in addition to case-insensitive variants of “False”.</li> |
| <li><strong>skipinitialspace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Skip spaces after delimiter.</li> |
| <li><strong>skiprows</strong> (<em>list-like</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em> or </em><em>callable</em><em>, </em><em>optional</em>) – <p>Line numbers to skip (0-indexed) or number of lines to skip (int) |
| at the start of the file.</p> |
| <p>If callable, the callable function will be evaluated against the row |
| indices, returning True if the row should be skipped and False otherwise. |
| An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</p> |
| </li> |
| <li><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 0</em>) – Number of lines at bottom of file to skip (Unsupported with engine=’c’).</li> |
| <li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – Number of rows of file to read. Useful for reading pieces of large files.</li> |
| <li><strong>na_values</strong> (<em>scalar</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – Additional strings to recognize as NA/NaN. If dict passed, specific |
| per-column NA values. By default the following values are interpreted as |
| NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, |
| ‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘None’, |
| ‘n/a’, ‘nan’, ‘null’.</li> |
| <li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default NaN values when parsing the data. |
| Depending on whether <cite>na_values</cite> is passed in, the behavior is as follows:</p> |
| <ul> |
| <li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are specified, <cite>na_values</cite> |
| is appended to the default NaN values used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are not specified, only |
| the default NaN values are used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are specified, only |
| the NaN values specified in <cite>na_values</cite> are used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are not specified, no |
| strings will be parsed as NaN.</li> |
| </ul> |
| <p>Note that if <cite>na_filter</cite> is passed in as False, the <cite>keep_default_na</cite> and |
| <cite>na_values</cite> parameters will be ignored.</p> |
| </li> |
| <li><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of na_values). In |
| data without any NAs, passing na_filter=False can improve the performance |
| of reading a large file.</li> |
| <li><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Indicate number of NA values placed in non-numeric columns.</li> |
| <li><strong>skip_blank_lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – If True, skip over blank lines rather than interpreting as NaN values.</li> |
| <li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em> or </em><em>list of int</em><em> or </em><em>names</em><em> or </em><em>list of lists</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p> |
| <ul> |
| <li>boolean. If True -> try parsing the index.</li> |
| <li>list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 |
| each as a separate date column.</li> |
| <li>list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as |
| a single date column.</li> |
| <li>dict, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call |
| result ‘foo’</li> |
| </ul> |
| <p>If a column or index cannot be represented as an array of datetimes, |
| say because of an unparsable value or a mixture of timezones, the column |
| or index will be returned unaltered as an object data type. For |
| non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after |
| <code class="docutils literal notranslate"><span class="pre">pd.read_csv</span></code>.</p> |
| <p>Note: A fast-path exists for iso8601-formatted dates.</p> |
| </li> |
| <li><strong>infer_datetime_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>If True and <cite>parse_dates</cite> is enabled, pandas will attempt to infer the |
| format of the datetime strings in the columns, and if it can be inferred, |
| switch to a faster method of parsing them. In some cases this can increase |
| the parsing speed by 5-10x.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified">Deprecated since version 2.0.0: </span>A strict version of this argument is now the default, passing it has no effect.</p> |
| </div> |
| </li> |
| <li><strong>keep_date_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – If True and <cite>parse_dates</cite> specifies combining multiple columns then |
| keep the original columns.</li> |
| <li><strong>date_parser</strong> (<em>function</em><em>, </em><em>optional</em>) – <p>Function to use for converting a sequence of string columns to an array of |
| datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the |
| conversion. Pandas will try to call <cite>date_parser</cite> in three different ways, |
| advancing to the next if an exception occurs: 1) Pass one or more arrays |
| (as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the |
| string values from the columns defined by <cite>parse_dates</cite> into a single array |
| and pass that; and 3) call <cite>date_parser</cite> once for each row using one or |
| more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as |
| arguments.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified">Deprecated since version 2.0.0: </span>Use <code class="docutils literal notranslate"><span class="pre">date_format</span></code> instead, or read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply |
| <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| </div> |
| </li> |
| <li><strong>date_format</strong> (str or dict of column -> format, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>If used in conjunction with <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>, will parse dates according to this |
| format. For anything more complex, |
| please read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dayfirst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – DD/MM format dates, international and European format.</li> |
| <li><strong>cache_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – If True, use a cache of unique, converted dates to apply the datetime |
| conversion. May produce significant speed-up when parsing duplicate |
| date strings, especially ones with timezone offsets.</li> |
| <li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>Return TextFileReader object for iteration or getting chunks with |
| <code class="docutils literal notranslate"><span class="pre">get_chunk()</span></code>.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p> |
| </div> |
| </li> |
| <li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – <p>Return TextFileReader object for iteration. |
| See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking">IO Tools docs</a> |
| for more information on <code class="docutils literal notranslate"><span class="pre">iterator</span></code> and <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p> |
| </div> |
| </li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </li> |
| <li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Thousands separator.</li> |
| <li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g. use ‘,’ for European data).</li> |
| <li><strong>lineterminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – Character to break file into lines. Only valid with C parser.</li> |
| <li><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – The character used to denote the start and end of a quoted item. Quoted |
| items can include the delimiter and it will be ignored.</li> |
| <li><strong>quoting</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em> or </em><em>csv.QUOTE_* instance</em><em>, </em><em>default 0</em>) – Control field quoting behavior per <code class="docutils literal notranslate"><span class="pre">csv.QUOTE_*</span></code> constants. Use one of |
| QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).</li> |
| <li><strong>doublequote</strong> (bool, default <code class="docutils literal notranslate"><span class="pre">True</span></code>) – When quotechar is specified and quoting is not <code class="docutils literal notranslate"><span class="pre">QUOTE_NONE</span></code>, indicate |
| whether or not to interpret two consecutive quotechar elements INSIDE a |
| field as a single <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> element.</li> |
| <li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> (</em><em>length 1</em><em>)</em><em>, </em><em>optional</em>) – One-character string used to escape other characters.</li> |
| <li><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Indicates remainder of line should not be parsed. If found at the beginning |
| of a line, the line will be ignored altogether. This parameter must be a |
| single character. Like empty lines (as long as <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>), |
| fully commented lines are ignored by the parameter <cite>header</cite> but not by |
| <cite>skiprows</cite>. For example, if <code class="docutils literal notranslate"><span class="pre">comment='#'</span></code>, parsing |
| <code class="docutils literal notranslate"><span class="pre">#empty\na,b,c\n1,2,3</span></code> with <code class="docutils literal notranslate"><span class="pre">header=0</span></code> will result in ‘a,b,c’ being |
| treated as the header.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default "utf-8"</em>) – <p>Encoding to use for UTF when reading/writing (ex. ‘utf-8’). <a class="reference external" href="https://docs.python.org/3/library/codecs.html#standard-encodings">List of Python |
| standard encodings</a> .</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span>When <code class="docutils literal notranslate"><span class="pre">encoding</span></code> is <code class="docutils literal notranslate"><span class="pre">None</span></code>, <code class="docutils literal notranslate"><span class="pre">errors="replace"</span></code> is passed to |
| <code class="docutils literal notranslate"><span class="pre">open()</span></code>. Otherwise, <code class="docutils literal notranslate"><span class="pre">errors="strict"</span></code> is passed to <code class="docutils literal notranslate"><span class="pre">open()</span></code>. |
| This behavior was previously only the case for <code class="docutils literal notranslate"><span class="pre">engine="python"</span></code>.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.3.0: </span><code class="docutils literal notranslate"><span class="pre">encoding_errors</span></code> is a new argument. <code class="docutils literal notranslate"><span class="pre">encoding</span></code> no longer has an |
| influence on how encoding errors are handled.</p> |
| </div> |
| </li> |
| <li><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default "strict"</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a> .</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.3.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dialect</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/csv.html#csv.Dialect" title="(in Python v3.12)"><em>csv.Dialect</em></a><em>, </em><em>optional</em>) – If provided, this parameter will override values (default or not) for the |
| following parameters: <cite>delimiter</cite>, <cite>doublequote</cite>, <cite>escapechar</cite>, |
| <cite>skipinitialspace</cite>, <cite>quotechar</cite>, and <cite>quoting</cite>. If it is necessary to |
| override values, a ParserWarning will be issued. See csv.Dialect |
| documentation for more details.</li> |
| <li><strong>on_bad_lines</strong> (<em>{'error'</em><em>, </em><em>'warn'</em><em>, </em><em>'skip'}</em><em> or </em><em>callable</em><em>, </em><em>default 'error'</em>) – <p>Specifies what to do upon encountering a bad line (a line with too many fields). |
| Allowed values are:</p> |
| <blockquote> |
| <div><ul> |
| <li>‘error’, raise an Exception when a bad line is encountered.</li> |
| <li>‘warn’, raise a warning when a bad line is encountered and skip that line.</li> |
| <li>‘skip’, skip bad lines without raising or warning when they are encountered.</li> |
| </ul> |
| </div></blockquote> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.3.0.</span></p> |
| </div> |
| <div class="versionadded"> |
| <ul> |
| <li><span class="versionmodified">New in version 1.4.0: </span>callable, function with signature |
| <code class="docutils literal notranslate"><span class="pre">(bad_line:</span> <span class="pre">list[str])</span> <span class="pre">-></span> <span class="pre">list[str]</span> <span class="pre">|</span> <span class="pre">None</span></code> that will process a single |
| bad line. <code class="docutils literal notranslate"><span class="pre">bad_line</span></code> is a list of strings split by the <code class="docutils literal notranslate"><span class="pre">sep</span></code>. |
| If the function returns <code class="docutils literal notranslate"><span class="pre">None</span></code>, the bad line will be ignored. |
| If the function returns a new list of strings with more elements than |
| expected, a <code class="docutils literal notranslate"><span class="pre">ParserWarning</span></code> will be emitted while dropping extra elements. |
| Only supported when <code class="docutils literal notranslate"><span class="pre">engine="python"</span></code></li> |
| </ul> |
| </div> |
| </li> |
| <li><strong>delim_whitespace</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Specifies whether or not whitespace (e.g. <code class="docutils literal notranslate"><span class="pre">'</span> <span class="pre">'</span></code> or <code class="docutils literal notranslate"><span class="pre">'</span>    <span class="pre">'</span></code>) will be |
| used as the sep. Equivalent to setting <code class="docutils literal notranslate"><span class="pre">sep='\s+'</span></code>. If this option |
| is set to True, nothing should be passed in for the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code> |
| parameter.</li> |
| <li><strong>low_memory</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Internally process the file in chunks, resulting in lower memory use |
| while parsing, but possibly mixed type inference. To ensure no mixed |
| types either set False, or specify the type with the <cite>dtype</cite> parameter. |
| Note that the entire file is read into a single DeferredDataFrame regardless, |
| use the <cite>chunksize</cite> or <cite>iterator</cite> parameter to return the data in chunks. |
| (Only valid with C parser).</li> |
| <li><strong>memory_map</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – If a filepath is provided for <cite>filepath_or_buffer</cite>, map the file object |
| directly onto memory and access the data directly from there. Using this |
| option can improve performance because there is no longer any I/O overhead.</li> |
| <li><strong>float_precision</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Specifies which converter the C engine should use for floating-point |
| values. The options are <code class="docutils literal notranslate"><span class="pre">None</span></code> or ‘high’ for the ordinary converter, |
| ‘legacy’ for the original lower precision pandas converter, and |
| ‘round_trip’ for the round-trip converter.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.</span></p> |
| </div> |
| </li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays. Nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set; pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A comma-separated values (csv) file is returned as a two-dimensional |
| data structure with labeled axes.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextFileReader</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>If your files are large and records do not contain quoted newlines, you may |
| pass the extra argument <code class="docutils literal notranslate"><span class="pre">splittable=True</span></code> to enable dynamic splitting for |
| this read on newlines. Using this option for records that do contain quoted |
| newlines may result in partial records and data corruption.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt> |
| <dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_fwf()</span></code></a></dt> |
| <dd>Read a table of fixed-width formatted lines into DeferredDataFrame.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API. In addition, some arguments shown here may not be supported, see <strong>‘Differences from pandas’</strong> for details.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">'data.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_csv"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_csv</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>transform_label=None</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_csv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_csv" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Write object to a comma-separated values (csv) file.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a><em>, </em><em>default None</em>) – <p>String, path object (implementing os.PathLike[str]), or file-like |
| object implementing a write() function. If None, the result is |
| returned as a string. If a non-binary file object is passed, it should |
| be opened with <cite>newline=''</cite>, disabling universal newlines. If a binary |
| file object is passed, <cite>mode</cite> might need to contain a <cite>'b'</cite>.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0: </span>Support for binary file objects was introduced.</p> |
| </div> |
| </li> |
| <li><strong>sep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '</em><em>,</em><em>'</em>) – String of length 1. Field delimiter for the output file.</li> |
| <li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</li> |
| <li><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>Callable</em><em>, </em><em>default None</em>) – Format string for floating point numbers. If a Callable is given, it takes |
| precedence over other numeric formatting parameters, like decimal.</li> |
| <li><strong>columns</strong> (<em>sequence</em><em>, </em><em>optional</em>) – Columns to write.</li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – Write out the column names. If a list of strings is given it is |
| assumed to be aliases for the column names.</li> |
| <li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</li> |
| <li><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><em>sequence</em><em>, or </em><em>False</em><em>, </em><em>default None</em>) – Column label for index column(s) if desired. If None is given, and |
| <cite>header</cite> and <cite>index</cite> are True, then the index names are used. A |
| sequence should be given if the object uses MultiIndex. If |
| False do not print fields for index names. Use index_label=False |
| for easier importing in R.</li> |
| <li><strong>mode</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'w'</em>) – Python write mode. The available write modes are the same as |
| <a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.12)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a>.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – A string representing the encoding to use in the output file, |
| defaults to ‘utf-8’. <cite>encoding</cite> is not supported if <cite>path_or_buf</cite> |
| is a non-binary file object.</li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.0.0: </span>May now be a dict with key ‘method’ as compression mode |
| and other entries as additional compression options if |
| compression mode is ‘zip’.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.1.0: </span>Passing compression options as keys in dict is |
| supported for compression modes ‘gzip’, ‘bz2’, ‘zstd’, and ‘zip’.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0: </span>Compression is supported for binary file objects.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0: </span>Previous versions forwarded dict entries for ‘gzip’ to |
| <cite>gzip.open</cite> instead of <cite>gzip.GzipFile</cite> which prevented |
| setting <cite>mtime</cite>.</p> |
| </div> |
| </li> |
| <li><strong>quoting</strong> (<em>optional constant from csv module</em>) – Defaults to csv.QUOTE_MINIMAL. If you have set a <cite>float_format</cite> |
| then floats are converted to strings and thus csv.QUOTE_NONNUMERIC |
| will treat them as non-numeric.</li> |
| <li><strong>quotechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '"'</em>) – String of length 1. Character used to quote fields.</li> |
| <li><strong>lineterminator</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – <p>The newline character or character sequence to use in the output |
| file. Defaults to <cite>os.linesep</cite>, which depends on the OS in which |
| this method is called (e.g. ‘\n’ for Linux, ‘\r\n’ for Windows).</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.5.0: </span>Previously was line_terminator, changed for consistency with |
| read_csv and the standard library ‘csv’ module.</p> |
| </div> |
| </li> |
| <li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a>) – Rows to write at a time.</li> |
| <li><strong>date_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – Format string for datetime objects.</li> |
| <li><strong>doublequote</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Control quoting of <cite>quotechar</cite> inside a field.</li> |
| <li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – String of length 1. Character used to escape <cite>sep</cite> and <cite>quotechar</cite> |
| when appropriate.</li> |
| <li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator. E.g. use ‘,’ for |
| European data.</li> |
| <li><strong>errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'strict'</em>) – <p>Specifies how encoding and decoding errors are to be handled. |
| See the errors argument for <a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.12)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> for a full list |
| of options.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.1.0.</span></p> |
| </div> |
| </li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">the pandas guide on reading/writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If path_or_buf is None, returns the resulting csv format as a |
| string. Otherwise returns None.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)">None</a> or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)">str</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Load a CSV file into a DeferredDataFrame.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.to_excel" title="apache_beam.dataframe.io.to_excel"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_excel()</span></code></a></dt> |
| <dd>Write DeferredDataFrame to an Excel file.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'name'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'Raphael'</span><span class="p">,</span> <span class="s1">'Donatello'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'mask'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'red'</span><span class="p">,</span> <span class="s1">'purple'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'weapon'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'sai'</span><span class="p">,</span> <span class="s1">'bo staff'</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="go">'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'</span> |
| |
| <span class="go">Create 'out.zip' containing 'out.csv'</span> |
| |
| <span class="gp">>>> </span><span class="n">compression_opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s1">'zip'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">archive_name</span><span class="o">=</span><span class="s1">'out.csv'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'out.zip'</span><span class="p">,</span> <span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="n">compression_opts</span><span class="p">)</span> |
| |
| <span class="go">To write a csv file to a new folder or nested folder you will first</span> |
| <span class="go">need to create it using either Pathlib or os:</span> |
| |
| <span class="gp">>>> </span><span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span> |
| <span class="gp">>>> </span><span class="n">filepath</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="s1">'folder/subfolder/out.csv'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">filepath</span><span class="o">.</span><span class="n">parent</span><span class="o">.</span><span class="n">mkdir</span><span class="p">(</span><span class="n">parents</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="n">filepath</span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="kn">import</span> <span class="nn">os</span> |
| <span class="gp">>>> </span><span class="n">os</span><span class="o">.</span><span class="n">makedirs</span><span class="p">(</span><span class="s1">'folder/subfolder'</span><span class="p">,</span> <span class="n">exist_ok</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span><span class="s1">'folder/subfolder/out.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_fwf"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_fwf</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_fwf"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_fwf" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read a table of fixed-width formatted lines into DataFrame.</p> |
| <p>Also supports optionally iterating or breaking of the file |
| into chunks.</p> |
| <p>Additional help can be found in the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html">online docs for IO Tools</a>.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a text <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.csv</span></code>.</li> |
| <li><strong>colspecs</strong> (<em>list of tuple</em><em> (</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>) or </em><em>'infer', optional</em>) – A list of tuples giving the extents of the fixed-width |
| fields of each line as half-open intervals (i.e., [from, to[ ). |
| String value ‘infer’ can be used to instruct the parser to try |
| detecting the column specifications from the first 100 rows of |
| the data which are not being skipped via skiprows (default=’infer’).</li> |
| <li><strong>widths</strong> (<em>list of int</em><em>, </em><em>optional</em>) – A list of field widths which can be used instead of ‘colspecs’ if |
| the intervals are contiguous.</li> |
| <li><strong>infer_nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 100</em>) – The number of rows to consider when letting the parser determine the |
| <cite>colspecs</cite>.</li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays. Nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set; pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>**kwds</strong> (<em>optional</em>) – Optional keyword arguments can be passed to <code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code>.</li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A comma-separated values (csv) file is returned as a two-dimensional |
| data structure with labeled axes.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or TextFileReader</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt> |
| <dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_fwf</span><span class="p">(</span><span class="s1">'data.csv'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_json"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_json</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_json"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_json" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Convert a JSON string to pandas object.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path_or_buf</strong> (<em>a valid JSON str</em><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.json</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any |
| <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </li> |
| <li><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – <p>Indication of expected JSON string format. |
| Compatible JSON strings can be produced by <code class="docutils literal notranslate"><span class="pre">to_json()</span></code> with a |
| corresponding orient value. |
| The set of possible orients is:</p> |
| <ul> |
| <li><code class="docutils literal notranslate"><span class="pre">'split'</span></code> : dict like |
| <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-></span> <span class="pre">[index],</span> <span class="pre">columns</span> <span class="pre">-></span> <span class="pre">[columns],</span> <span class="pre">data</span> <span class="pre">-></span> <span class="pre">[values]}</span></code></li> |
| <li><code class="docutils literal notranslate"><span class="pre">'records'</span></code> : list like |
| <code class="docutils literal notranslate"><span class="pre">[{column</span> <span class="pre">-></span> <span class="pre">value},</span> <span class="pre">...</span> <span class="pre">,</span> <span class="pre">{column</span> <span class="pre">-></span> <span class="pre">value}]</span></code></li> |
| <li><code class="docutils literal notranslate"><span class="pre">'index'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-></span> <span class="pre">{column</span> <span class="pre">-></span> <span class="pre">value}}</span></code></li> |
| <li><code class="docutils literal notranslate"><span class="pre">'columns'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{column</span> <span class="pre">-></span> <span class="pre">{index</span> <span class="pre">-></span> <span class="pre">value}}</span></code></li> |
| <li><code class="docutils literal notranslate"><span class="pre">'values'</span></code> : just the values array</li> |
| </ul> |
| <p>The allowed and default values depend on the value |
| of the <cite>typ</cite> parameter.</p> |
| <ul> |
| <li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'series'</span></code>,<ul> |
| <li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index'}</span></code></li> |
| <li>default is <code class="docutils literal notranslate"><span class="pre">'index'</span></code></li> |
| <li>The DeferredSeries index must be unique for orient <code class="docutils literal notranslate"><span class="pre">'index'</span></code>.</li> |
| </ul> |
| </li> |
| <li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'frame'</span></code>,<ul> |
| <li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index',</span> |
| <span class="pre">'columns','values',</span> <span class="pre">'table'}</span></code></li> |
| <li>default is <code class="docutils literal notranslate"><span class="pre">'columns'</span></code></li> |
| <li>The DeferredDataFrame index must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code> and |
| <code class="docutils literal notranslate"><span class="pre">'columns'</span></code>.</li> |
| <li>The DeferredDataFrame columns must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">'columns'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'records'</span></code>.</li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li><strong>typ</strong> (<em>{'frame'</em><em>, </em><em>'series'}</em><em>, </em><em>default 'frame'</em>) – The type of object to recover.</li> |
| <li><strong>dtype</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default None</em>) – <p>If True, infer dtypes; if a dict of column to dtype, then use those; |
| if False, then don’t infer dtypes at all, applies only to the data.</p> |
| <p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p> |
| </li> |
| <li><strong>convert_axes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default None</em>) – <p>Try to convert the axes to the proper dtypes.</p> |
| <p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p> |
| </li> |
| <li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – If True then default datelike columns may be converted (depending on |
| keep_default_dates). |
| If False, no dates will be converted. |
| If a list of column names, then those columns will be converted and |
| default datelike columns may also be converted (depending on |
| keep_default_dates).</li> |
| <li><strong>keep_default_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>If parsing dates (convert_dates is not False), then try to parse the |
| default datelike columns. |
| A column label is datelike if</p> |
| <ul> |
| <li>it ends with <code class="docutils literal notranslate"><span class="pre">'_at'</span></code>,</li> |
| <li>it ends with <code class="docutils literal notranslate"><span class="pre">'_time'</span></code>,</li> |
| <li>it begins with <code class="docutils literal notranslate"><span class="pre">'timestamp'</span></code>,</li> |
| <li>it is <code class="docutils literal notranslate"><span class="pre">'modified'</span></code>, or</li> |
| <li>it is <code class="docutils literal notranslate"><span class="pre">'date'</span></code>.</li> |
| </ul> |
| </li> |
| <li><strong>precise_float</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Set to enable usage of higher precision (strtod) function when |
| decoding string to double values. Default (False) is to use fast but |
| less precise builtin functionality.</li> |
| <li><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – The timestamp unit to detect if converting dates. The default behaviour |
| is to try and detect the correct precision, but if this is not desired |
| then pass one of ‘s’, ‘ms’, ‘us’ or ‘ns’ to force parsing only seconds, |
| milliseconds, microseconds or nanoseconds respectively.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default is 'utf-8'</em>) – The encoding to use to decode py3 bytes.</li> |
| <li><strong>encoding_errors</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default "strict"</em>) – <p>How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.3.0.</span></p> |
| </div> |
| </li> |
| <li><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Read the file as a json object per line.</li> |
| <li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – <p>Return JsonReader object for iteration. |
| See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json">line-delimited json docs</a> |
| for more information on <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>. |
| This can only be passed if <cite>lines=True</cite>. |
| If this is None, the file will be read into memory all at once.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">JsonReader</span></code> is a context manager.</p> |
| </div> |
| </li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </li> |
| <li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – <p>The number of lines to read from the line-delimited JSON file. |
| This can only be passed if <cite>lines=True</cite>. |
| If this is None, all the rows will be returned.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.1.</span></p> |
| </div> |
| </li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">the pandas guide on reading/writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays. Nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set; pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>engine</strong> (<em>{"ujson"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>default "ujson"</em>) – <p>Parser engine to use. The <code class="docutils literal notranslate"><span class="pre">"pyarrow"</span></code> engine is only available when |
| <code class="docutils literal notranslate"><span class="pre">lines=True</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">The type returned depends on the value of <cite>typ</cite>.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredSeries" title="apache_beam.dataframe.frames.DeferredSeries">DeferredSeries</a> or <a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json()</span></code></dt> |
| <dd>Convert a DeferredDataFrame to a JSON string.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredSeries.to_json()</span></code></dt> |
| <dd>Convert a DeferredSeries to a JSON string.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">json_normalize()</span></code></dt> |
| <dd>Normalize semi-structured JSON data into a flat table.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Specific to <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, if a <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> with a literal |
| <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name of <cite>index</cite> gets written with <a class="reference internal" href="#apache_beam.dataframe.io.to_json" title="apache_beam.dataframe.io.to_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_json()</span></code></a>, the |
| subsequent read operation will incorrectly set the <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name to |
| <code class="docutils literal notranslate"><span class="pre">None</span></code>. This is because <cite>index</cite> is also used by <code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_json()</span></code> |
| to denote a missing <code class="xref py py-class docutils literal notranslate"><span class="pre">Index</span></code> name, and the subsequent |
| <a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_json()</span></code></a> operation cannot distinguish between the two. The same |
| limitation is encountered with a <code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code> and any names |
| beginning with <code class="docutils literal notranslate"><span class="pre">'level_'</span></code>.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">],</span> <span class="p">[</span><span class="s1">'c'</span><span class="p">,</span> <span class="s1">'d'</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">'row 1'</span><span class="p">,</span> <span class="s1">'row 2'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'col 1'</span><span class="p">,</span> <span class="s1">'col 2'</span><span class="p">])</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'split'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'split'</span><span class="p">)</span> |
| <span class="go"> '{"columns":["col 1","col 2"],"index":["row 1","row 2"],"data":[["a","b"],["c","d"]]}'</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'split'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">row 1 a b</span> |
| <span class="go">row 2 c d</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'index'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'index'</span><span class="p">)</span> |
| <span class="go">'{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'index'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">row 1 a b</span> |
| <span class="go">row 2 c d</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'records'`` formatted JSON.</span> |
| <span class="go">Note that index labels are not preserved with this encoding.</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'records'</span><span class="p">)</span> |
| <span class="go">'[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_json</span><span class="p">(</span><span class="n">_</span><span class="p">,</span> <span class="n">orient</span><span class="o">=</span><span class="s1">'records'</span><span class="p">)</span> |
| <span class="go"> col 1 col 2</span> |
| <span class="go">0 a b</span> |
| <span class="go">1 c d</span> |
| |
| <span class="go">Encoding with Table Schema</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s1">'table'</span><span class="p">)</span> |
| <span class="go"> '{"schema":{"fields":[{"name":"index","type":"string"},{"name":"col 1","type":"string"},{"name":"col 2","type":"string"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":"row 1","col 1":"a","col 2":"b"},{"index":"row 2","col 1":"c","col 2":"d"}]}'</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_json"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_json</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>orient=None</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_json"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_json" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Convert the object to a JSON string.</p> |
| <p>Note that NaN and None values will be converted to null, and datetime objects |
| will be converted to UNIX timestamps.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path_or_buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a><em>, </em><em>default None</em>) – String, path object (implementing os.PathLike[str]), or file-like |
| object implementing a write() function. If None, the result is |
| returned as a string.</li> |
| <li><strong>orient</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a>) – <p>Indication of expected JSON string format.</p> |
| <ul> |
| <li>DeferredSeries:<blockquote> |
| <div><ul> |
| <li>default is ‘index’</li> |
| <li>allowed values are: {‘split’, ‘records’, ‘index’, ‘table’}.</li> |
| </ul> |
| </div></blockquote> |
| </li> |
| <li>DeferredDataFrame:<blockquote> |
| <div><ul> |
| <li>default is ‘columns’</li> |
| <li>allowed values are: {‘split’, ‘records’, ‘index’, ‘columns’, |
| ‘values’, ‘table’}.</li> |
| </ul> |
| </div></blockquote> |
| </li> |
| <li>The format of the JSON string:<blockquote> |
| <div><ul> |
| <li>‘split’ : dict like {‘index’ -> [index], ‘columns’ -> [columns], |
| ‘data’ -> [values]}</li> |
| <li>‘records’ : list like [{column -> value}, … , {column -> value}]</li> |
| <li>‘index’ : dict like {index -> {column -> value}}</li> |
| <li>‘columns’ : dict like {column -> {index -> value}}</li> |
| <li>‘values’ : just the values array</li> |
| <li>‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}}</li> |
| </ul> |
| <p>Describing the data, where data component is like <code class="docutils literal notranslate"><span class="pre">orient='records'</span></code>.</p> |
| </div></blockquote> |
| </li> |
| </ul> |
| </li> |
| <li><strong>date_format</strong> (<em>{None</em><em>, </em><em>'epoch'</em><em>, </em><em>'iso'}</em>) – Type of date conversion. ‘epoch’ = epoch milliseconds, |
| ‘iso’ = ISO8601. The default depends on the <cite>orient</cite>. For |
| <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, the default is ‘iso’. For all other orients, |
| the default is ‘epoch’.</li> |
| <li><strong>double_precision</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 10</em>) – The number of decimal places to use when encoding |
| floating point values.</li> |
| <li><strong>force_ascii</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Force encoded string to be ASCII.</li> |
| <li><strong>date_unit</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'ms'</em><em> (</em><em>milliseconds</em><em>)</em>) – The time unit to encode to, governs timestamp and ISO8601 |
| precision. One of ‘s’, ‘ms’, ‘us’, ‘ns’ for second, millisecond, |
| microsecond, and nanosecond respectively.</li> |
| <li><strong>default_handler</strong> (<em>callable</em><em>, </em><em>default None</em>) – Handler to call if object cannot otherwise be converted to a |
| suitable format for JSON. Should receive a single argument which is |
| the object to convert and return a serialisable object.</li> |
| <li><strong>lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – If ‘orient’ is ‘records’, write out line-delimited JSON format. Will |
| throw ValueError for any other ‘orient’, since the other formats are not |
| list-like.</li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </li> |
| <li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether to include the index values in the JSON string. Not |
| including the index (<code class="docutils literal notranslate"><span class="pre">index=False</span></code>) is only supported when |
| orient is ‘split’ or ‘table’.</li> |
| <li><strong>indent</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – Length of whitespace used to indent each record.</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer to the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas I/O user guide</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>mode</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'w'</em><em> (</em><em>writing</em><em>)</em>) – Specify the IO mode for output when supplying a path_or_buf. |
| Accepted args are ‘w’ (writing) and ‘a’ (append) only. |
| mode=‘a’ is only supported when lines is True and orient is ‘records’.</li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If path_or_buf is None, returns the resulting json format as a |
| string. Otherwise returns None.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)">None</a> or <a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)">str</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_json" title="apache_beam.dataframe.io.read_json"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_json()</span></code></a></dt> |
| <dd>Convert a JSON string to pandas object.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>The behavior of <code class="docutils literal notranslate"><span class="pre">indent=0</span></code> varies from the stdlib, which does not |
| indent the output but does insert newlines. Currently, <code class="docutils literal notranslate"><span class="pre">indent=0</span></code> |
| and the default <code class="docutils literal notranslate"><span class="pre">indent=None</span></code> are equivalent in pandas, though this |
| may change in a future release.</p> |
| <p><code class="docutils literal notranslate"><span class="pre">orient='table'</span></code> contains a ‘pandas_version’ field under ‘schema’. |
| This stores the version of <cite>pandas</cite> used in the latest revision of the |
| schema.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="kn">from</span> <span class="nn">json</span> <span class="kn">import</span> <span class="n">loads</span><span class="p">,</span> <span class="n">dumps</span> |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span> |
| <span class="gp">... </span> <span class="p">[[</span><span class="s2">"a"</span><span class="p">,</span> <span class="s2">"b"</span><span class="p">],</span> <span class="p">[</span><span class="s2">"c"</span><span class="p">,</span> <span class="s2">"d"</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s2">"row 1"</span><span class="p">,</span> <span class="s2">"row 2"</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">"col 1"</span><span class="p">,</span> <span class="s2">"col 2"</span><span class="p">],</span> |
| <span class="gp">... </span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"split"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "columns": [</span> |
| <span class="go"> "col 1",</span> |
| <span class="go"> "col 2"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "index": [</span> |
| <span class="go"> "row 1",</span> |
| <span class="go"> "row 2"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "data": [</span> |
| <span class="go"> [</span> |
| <span class="go"> "a",</span> |
| <span class="go"> "b"</span> |
| <span class="go"> ],</span> |
| <span class="go"> [</span> |
| <span class="go"> "c",</span> |
| <span class="go"> "d"</span> |
| <span class="go"> ]</span> |
| <span class="go"> ]</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'records'`` formatted JSON.</span> |
| <span class="go">Note that index labels are not preserved with this encoding.</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"records"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">[</span> |
| <span class="go"> {</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">]</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'index'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"index"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "row 1": {</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> "row 2": {</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"columns"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "col 1": {</span> |
| <span class="go"> "row 1": "a",</span> |
| <span class="go"> "row 2": "c"</span> |
| <span class="go"> },</span> |
| <span class="go"> "col 2": {</span> |
| <span class="go"> "row 1": "b",</span> |
| <span class="go"> "row 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go">}</span> |
| |
| <span class="go">Encoding/decoding a Dataframe using ``'values'`` formatted JSON:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"values"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">[</span> |
| <span class="go"> [</span> |
| <span class="go"> "a",</span> |
| <span class="go"> "b"</span> |
| <span class="go"> ],</span> |
| <span class="go"> [</span> |
| <span class="go"> "c",</span> |
| <span class="go"> "d"</span> |
| <span class="go"> ]</span> |
| <span class="go">]</span> |
| |
| <span class="go">Encoding with Table Schema:</span> |
| |
| <span class="gp">>>> </span><span class="n">result</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">"table"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">parsed</span> <span class="o">=</span> <span class="n">loads</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">dumps</span><span class="p">(</span><span class="n">parsed</span><span class="p">,</span> <span class="n">indent</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span> |
| <span class="go">{</span> |
| <span class="go"> "schema": {</span> |
| <span class="go"> "fields": [</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "index",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "col 1",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "name": "col 2",</span> |
| <span class="go"> "type": "string"</span> |
| <span class="go"> }</span> |
| <span class="go"> ],</span> |
| <span class="go"> "primaryKey": [</span> |
| <span class="go"> "index"</span> |
| <span class="go"> ],</span> |
| <span class="go"> "pandas_version": "1.4.0"</span> |
| <span class="go"> },</span> |
| <span class="go"> "data": [</span> |
| <span class="go"> {</span> |
| <span class="go"> "index": "row 1",</span> |
| <span class="go"> "col 1": "a",</span> |
| <span class="go"> "col 2": "b"</span> |
| <span class="go"> },</span> |
| <span class="go"> {</span> |
| <span class="go"> "index": "row 2",</span> |
| <span class="go"> "col 1": "c",</span> |
| <span class="go"> "col 2": "d"</span> |
| <span class="go"> }</span> |
| <span class="go"> ]</span> |
| <span class="go">}</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_html"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_html</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#read_html"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.read_html" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read HTML tables into a <code class="docutils literal notranslate"><span class="pre">list</span></code> of <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> objects.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a string <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. |
| The string can represent a URL or the HTML itself. Note that |
| lxml only accepts the http, ftp and file url protocols. If you have a |
| URL that starts with <code class="docutils literal notranslate"><span class="pre">'https'</span></code> you might try removing the <code class="docutils literal notranslate"><span class="pre">'s'</span></code>.</li> |
| <li><strong>match</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><em>compiled regular expression</em><em>, </em><em>optional</em>) – The set of tables containing text matching this regex or string will be |
| returned. Unless the HTML is extremely simple you will probably need to |
| pass a non-empty string here. Defaults to ‘.+’ (match any non-empty |
| string). The default value will return all tables contained on a page. |
| This value is converted to a regular expression so that there is |
| consistent behavior between Beautiful Soup and lxml.</li> |
| <li><strong>flavor</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – The parsing engine to use. ‘bs4’ and ‘html5lib’ are synonymous with |
| each other, they are both there for backwards compatibility. The |
| default of <code class="docutils literal notranslate"><span class="pre">None</span></code> tries to use <code class="docutils literal notranslate"><span class="pre">lxml</span></code> to parse and if that fails it |
| falls back on <code class="docutils literal notranslate"><span class="pre">bs4</span></code> + <code class="docutils literal notranslate"><span class="pre">html5lib</span></code>.</li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The row (or list of rows for a <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.MultiIndex.html#pandas.MultiIndex" title="(in pandas v3.0.0.dev0+802.g7c836ed2ec)"><code class="xref py py-class docutils literal notranslate"><span class="pre">MultiIndex</span></code></a>) to use to |
| make the columns headers.</li> |
| <li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em> or </em><em>list-like</em><em>, </em><em>optional</em>) – The column (or list of columns) to use to create the index.</li> |
| <li><strong>skiprows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>list-like</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#slice" title="(in Python v3.12)"><em>slice</em></a><em>, </em><em>optional</em>) – Number of rows to skip after parsing the column integer. 0-based. If a |
| sequence of integers or a slice is given, will skip the rows indexed by |
| that sequence. Note that a single element sequence means ‘skip the nth |
| row’ whereas an integer means ‘skip n rows’.</li> |
| <li><strong>attrs</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>This is a dictionary of attributes that you can pass to identify |
| the table in the HTML. These are not checked for validity before being |
| passed to lxml or Beautiful Soup. However, these attributes must be |
| valid HTML table attributes to work correctly. For example,</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'id'</span><span class="p">:</span> <span class="s1">'table'</span><span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>is a valid attribute dictionary because the ‘id’ HTML tag attribute is |
| a valid HTML attribute for <em>any</em> HTML tag as per <a class="reference external" href="https://html.spec.whatwg.org/multipage/dom.html#global-attributes">this document</a>.</p> |
| <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">attrs</span> <span class="o">=</span> <span class="p">{</span><span class="s1">'asdf'</span><span class="p">:</span> <span class="s1">'table'</span><span class="p">}</span> |
| </pre></div> |
| </div> |
| <p>is <em>not</em> a valid attribute dictionary because ‘asdf’ is not a valid |
| HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 |
| table attributes can be found <a class="reference external" href="http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2">here</a>. A |
| working draft of the HTML 5 spec can be found <a class="reference external" href="https://html.spec.whatwg.org/multipage/tables.html">here</a>. It contains the |
| latest information on table attributes for the modern web.</p> |
| </li> |
| <li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>optional</em>) – See <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> for more details.</li> |
| <li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Separator to use to parse thousands. Defaults to <code class="docutils literal notranslate"><span class="pre">','</span></code>.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – The encoding used to decode the web page. Defaults to <code class="docutils literal notranslate"><span class="pre">None</span></code>. <code class="docutils literal notranslate"><span class="pre">None</span></code> |
| preserves the previous encoding behavior, which depends on the |
| underlying parser library (e.g., the parser library will try to use |
| the encoding provided by the document).</li> |
| <li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character to recognize as decimal point (e.g. use ‘,’ for European |
| data).</li> |
| <li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can |
| either be integers or column labels, values are functions that take one |
| input argument, the cell (not column) content, and return the |
| transformed content.</li> |
| <li><strong>na_values</strong> (<em>iterable</em><em>, </em><em>default None</em>) – Custom NA values.</li> |
| <li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – If na_values are specified and keep_default_na is False the default NaN |
| values are overridden, otherwise they’re appended to.</li> |
| <li><strong>displayed_only</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether elements with “display: none” should be parsed.</li> |
| <li><strong>extract_links</strong> (<em>{None</em><em>, </em><em>"all"</em><em>, </em><em>"header"</em><em>, </em><em>"body"</em><em>, </em><em>"footer"}</em>) – <p>Table elements in the specified section(s) with &lt;a&gt; tags will have their |
| href extracted.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays. Nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set; pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">A list of DeferredDataFrames.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">dfs</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Before using this function you should read the <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-html-gotchas" title="(in pandas v3.0.0.dev0+802.g7c836ed2ec)"><span class="xref std std-ref">gotchas about the |
| HTML parsing libraries</span></a>.</p> |
| <p>Expect to do some cleanup after you call this function. For example, you |
| might need to manually assign column names if the column names are |
| converted to NaN when you pass the <cite>header=0</cite> argument. We try to assume as |
| little as possible about the structure of the table and push the |
| idiosyncrasies of the HTML contained in the table to the user.</p> |
| <p>This function searches for <code class="docutils literal notranslate"><span class="pre">&lt;table&gt;</span></code> elements and only for <code class="docutils literal notranslate"><span class="pre">&lt;tr&gt;</span></code> |
| and <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> rows and <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> elements within each <code class="docutils literal notranslate"><span class="pre">&lt;tr&gt;</span></code> or <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> |
| element in the table. <code class="docutils literal notranslate"><span class="pre">&lt;td&gt;</span></code> stands for “table data”. This function |
| attempts to properly handle <code class="docutils literal notranslate"><span class="pre">colspan</span></code> and <code class="docutils literal notranslate"><span class="pre">rowspan</span></code> attributes. |
| If the table has a <code class="docutils literal notranslate"><span class="pre">&lt;thead&gt;</span></code> element, it is used to construct |
| the header, otherwise the function attempts to find the header within |
| the body (by putting rows with only <code class="docutils literal notranslate"><span class="pre">&lt;th&gt;</span></code> elements into the header).</p> |
| <p>Similar to <a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a> the <cite>header</cite> argument is applied |
| <strong>after</strong> <cite>skiprows</cite> is applied.</p> |
| <p>This function will <em>always</em> return a list of <code class="xref py py-class docutils literal notranslate"><span class="pre">DeferredDataFrame</span></code> <em>or</em> |
| it will fail, e.g., it will <em>not</em> return an empty list.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">See the :ref:`read_html documentation in the IO section of the docs</span> |
| <span class="go">&lt;io.read_html&gt;` for some examples of reading in HTML tables.</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_html"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_html</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#to_html"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.to_html" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Render a DataFrame as an HTML table.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>buf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>Path</em><em> or </em><em>StringIO-like</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Buffer to write to. If None, the output is returned as a string.</li> |
| <li><strong>columns</strong> (<em>sequence</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – The subset of columns to write. Writes all columns by default.</li> |
| <li><strong>col_space</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em> or </em><em>dict of int</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – The minimum width of each column in CSS length units. An int is assumed to be px units.</li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether to print column labels, default True.</li> |
| <li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Whether to print index (row) labels.</li> |
| <li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>default 'NaN'</em>) – String representation of <code class="docutils literal notranslate"><span class="pre">NaN</span></code> to use.</li> |
| <li><strong>formatters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.12)"><em>tuple</em></a><em> or </em><em>dict of one-param. functions</em><em>, </em><em>optional</em>) – Formatter functions to apply to columns’ elements by position or |
| name. |
| The result of each function must be a unicode string. |
| List/tuple must be of length equal to the number of columns.</li> |
| <li><strong>float_format</strong> (<em>one-parameter function</em><em>, </em><em>optional</em><em>, </em><em>default None</em>) – <p>Formatter function to apply to columns’ elements if they are |
| floats. This function must return a unicode string and will be |
| applied only to the non-<code class="docutils literal notranslate"><span class="pre">NaN</span></code> elements, with <code class="docutils literal notranslate"><span class="pre">NaN</span></code> being |
| handled by <code class="docutils literal notranslate"><span class="pre">na_rep</span></code>.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>sparsify</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Set to False for a DeferredDataFrame with a hierarchical index to print |
| every multiindex key at each row.</li> |
| <li><strong>index_names</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>optional</em><em>, </em><em>default True</em>) – Prints the names of the indexes.</li> |
| <li><strong>justify</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – <p>How to justify the column labels. If None uses the option from |
| the print configuration (controlled by set_option), ‘right’ out |
| of the box. Valid values are</p> |
| <ul> |
| <li>left</li> |
| <li>right</li> |
| <li>center</li> |
| <li>justify</li> |
| <li>justify-all</li> |
| <li>start</li> |
| <li>end</li> |
| <li>inherit</li> |
| <li>match-parent</li> |
| <li>initial</li> |
| <li>unset.</li> |
| </ul> |
| </li> |
| <li><strong>max_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of rows to display in the console.</li> |
| <li><strong>max_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>optional</em>) – Maximum number of columns to display in the console.</li> |
| <li><strong>show_dimensions</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Display DeferredDataFrame dimensions (number of rows by number of columns).</li> |
| <li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '.'</em>) – Character recognized as decimal separator, e.g. ‘,’ in Europe.</li> |
| <li><strong>bold_rows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Make the row labels bold in the output.</li> |
| <li><strong>classes</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#tuple" title="(in Python v3.12)"><em>tuple</em></a><em>, </em><em>default None</em>) – CSS class(es) to apply to the resulting html table.</li> |
| <li><strong>escape</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert the characters &lt;, &gt;, and &amp; to HTML-safe sequences.</li> |
| <li><strong>notebook</strong> (<em>{True</em><em>, </em><em>False}</em><em>, </em><em>default False</em>) – Whether the generated HTML is for IPython Notebook.</li> |
| <li><strong>border</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a>) – A <code class="docutils literal notranslate"><span class="pre">border=border</span></code> attribute is included in the opening |
| <cite>&lt;table&gt;</cite> tag. Default <code class="docutils literal notranslate"><span class="pre">pd.options.display.html.border</span></code>.</li> |
| <li><strong>table_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – A css id is included in the opening <cite>&lt;table&gt;</cite> tag if specified.</li> |
| <li><strong>render_links</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Convert URLs to HTML links.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default "utf-8"</em>) – <p>Set character encoding.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">If buf is None, returns the result as a string. Otherwise returns |
| None.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)">str</a> or <a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)">None</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">to_string()</span></code></dt> |
| <dd>Convert DeferredDataFrame to a string.</dd> |
| </dl> |
| </div> |
| </dd></dl> |
| |
| <dl class="class"> |
| <dt id="apache_beam.dataframe.io.ReadViaPandas"> |
| <em class="property">class </em><code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">ReadViaPandas</code><span class="sig-paren">(</span><em>format</em>, <em>*args</em>, <em>include_indexes=False</em>, <em>objects_as_strings=True</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#ReadViaPandas"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.ReadViaPandas" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p> |
| <dl class="method"> |
| <dt id="apache_beam.dataframe.io.ReadViaPandas.expand"> |
| <code class="descname">expand</code><span class="sig-paren">(</span><em>p</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#ReadViaPandas.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.ReadViaPandas.expand" title="Permalink to this definition">¶</a></dt> |
| <dd></dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="class"> |
| <dt id="apache_beam.dataframe.io.WriteViaPandas"> |
| <em class="property">class </em><code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">WriteViaPandas</code><span class="sig-paren">(</span><em>format</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#WriteViaPandas"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.WriteViaPandas" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p> |
| <dl class="method"> |
| <dt id="apache_beam.dataframe.io.WriteViaPandas.expand"> |
| <code class="descname">expand</code><span class="sig-paren">(</span><em>pcoll</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/dataframe/io.html#WriteViaPandas.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.dataframe.io.WriteViaPandas.expand" title="Permalink to this definition">¶</a></dt> |
| <dd></dd></dl> |
| |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_excel"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_excel</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_excel" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read an Excel file into a pandas DataFrame.</p> |
| <p>Supports <cite>xls</cite>, <cite>xlsx</cite>, <cite>xlsm</cite>, <cite>xlsb</cite>, <cite>odf</cite>, <cite>ods</cite> and <cite>odt</cite> file extensions |
| read from a local filesystem or URL. Supports an option to read |
| a single sheet or a list of sheets.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>io</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.12)"><em>bytes</em></a><em>, </em><em>ExcelFile</em><em>, </em><em>xlrd.Book</em><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.xlsx</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </li> |
| <li><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a><em>, </em><em>default 0</em>) – <p>Strings are used for sheet names. Integers are used in zero-indexed |
| sheet positions (chart sheets do not count as a sheet position). |
| Lists of strings/integers are used to request multiple sheets. |
| Specify None to get all worksheets.</p> |
| <p>Available cases:</p> |
| <ul> |
| <li>Defaults to <code class="docutils literal notranslate"><span class="pre">0</span></code>: 1st sheet as a <cite>DeferredDataFrame</cite></li> |
| <li><code class="docutils literal notranslate"><span class="pre">1</span></code>: 2nd sheet as a <cite>DeferredDataFrame</cite></li> |
| <li><code class="docutils literal notranslate"><span class="pre">"Sheet1"</span></code>: Load sheet with name “Sheet1”</li> |
| <li><code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">"Sheet5"]</span></code>: Load first, second and sheet named “Sheet5” |
| as a dict of <cite>DeferredDataFrame</cite></li> |
| <li>None: All worksheets.</li> |
| </ul> |
| </li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><em>default 0</em>) – Row (0-indexed) to use for the column labels of the parsed |
| DeferredDataFrame. If a list of integers is passed those row positions will |
| be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. Use None if there is no header.</li> |
| <li><strong>names</strong> (<em>array-like</em><em>, </em><em>default None</em>) – List of column names to use. If file contains no header row, |
| then you should explicitly pass header=None.</li> |
| <li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>list of int</em><em>, </em><em>default None</em>) – <p>Column (0-indexed) to use as the row labels of the DeferredDataFrame. |
| Pass None if there is no such column. If a list is passed, |
| those columns will be combined into a <code class="docutils literal notranslate"><span class="pre">MultiIndex</span></code>. If a |
| subset of data is selected with <code class="docutils literal notranslate"><span class="pre">usecols</span></code>, index_col |
| is based on the subset.</p> |
| <p>Missing values will be forward filled to allow roundtripping with |
| <code class="docutils literal notranslate"><span class="pre">to_excel</span></code> for <code class="docutils literal notranslate"><span class="pre">merged_cells=True</span></code>. To avoid forward filling the |
| missing values use <code class="docutils literal notranslate"><span class="pre">set_index</span></code> after reading the data instead of |
| <code class="docutils literal notranslate"><span class="pre">index_col</span></code>.</p> |
| </li> |
| <li><strong>usecols</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><em>callable</em><em>, </em><em>default None</em>) – <ul> |
| <li>If None, then parse all columns.</li> |
| <li>If str, then indicates comma separated list of Excel column letters |
| and column ranges (e.g. “A:E” or “A,C,E:F”). Ranges are inclusive of |
| both sides.</li> |
| <li>If list of int, then indicates list of column numbers to be parsed |
| (0-indexed).</li> |
| <li>If list of string, then indicates list of column names to be parsed.</li> |
| <li>If callable, then evaluate each column name against it and parse the |
| column if the callable returns <code class="docutils literal notranslate"><span class="pre">True</span></code>.</li> |
| </ul> |
| <p>Returns a subset of the columns according to behavior above.</p> |
| </li> |
| <li><strong>dtype</strong> (<em>Type name</em><em> or </em><em>dict of column -> type</em><em>, </em><em>default None</em>) – Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32} |
| Use <cite>object</cite> to preserve data as stored in Excel and not interpret dtype. |
| If converters are specified, they will be applied INSTEAD |
| of dtype conversion.</li> |
| <li><strong>engine</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – <p>If io is not a buffer or path, this must be set to identify io. |
| Supported engines: “xlrd”, “openpyxl”, “odf”, “pyxlsb”. |
| Engine compatibility:</p> |
| <ul> |
| <li>“xlrd” supports old-style Excel files (.xls).</li> |
| <li>“openpyxl” supports newer Excel file formats.</li> |
| <li>“odf” supports OpenDocument file formats (.odf, .ods, .odt).</li> |
| <li>“pyxlsb” supports Binary Excel files.</li> |
| </ul> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0: </span>The engine <a class="reference external" href="https://xlrd.readthedocs.io/en/latest/">xlrd</a> |
| now only supports old-style <code class="docutils literal notranslate"><span class="pre">.xls</span></code> files. |
| When <code class="docutils literal notranslate"><span class="pre">engine=None</span></code>, the following logic will be |
| used to determine the engine:</p> |
| <ul> |
| <li>If <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an OpenDocument format (.odf, .ods, .odt), |
| then <a class="reference external" href="https://pypi.org/project/odfpy/">odf</a> will be used.</li> |
| <li>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is an xls format, |
| <code class="docutils literal notranslate"><span class="pre">xlrd</span></code> will be used.</li> |
| <li>Otherwise if <code class="docutils literal notranslate"><span class="pre">path_or_buffer</span></code> is in xlsb format, |
| <code class="docutils literal notranslate"><span class="pre">pyxlsb</span></code> will be used.<div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.3.0.</span></p> |
| </div> |
| </li> |
| <li>Otherwise <code class="docutils literal notranslate"><span class="pre">openpyxl</span></code> will be used.<div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.3.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </div> |
| </li> |
| <li><strong>converters</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default None</em>) – Dict of functions for converting values in certain columns. Keys can |
| either be integers or column labels, values are functions that take one |
| input argument, the Excel cell content, and return the transformed |
| content.</li> |
| <li><strong>true_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as True.</li> |
| <li><strong>false_values</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>default None</em>) – Values to consider as False.</li> |
| <li><strong>skiprows</strong> (<em>list-like</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, or </em><em>callable</em><em>, </em><em>optional</em>) – Line numbers to skip (0-indexed) or number of lines to skip (int) at the |
| start of the file. If callable, the callable function will be evaluated |
| against the row indices, returning True if the row should be skipped and |
| False otherwise. An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> |
| <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</li> |
| <li><strong>nrows</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default None</em>) – Number of rows to parse.</li> |
| <li><strong>na_values</strong> (<em>scalar</em><em>, </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default None</em>) – Additional strings to recognize as NA/NaN. If dict passed, specific |
| per-column NA values. By default the following values are interpreted |
| as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, |
| ‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘None’, |
| ‘n/a’, ‘nan’, ‘null’.</li> |
| <li><strong>keep_default_na</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – <p>Whether or not to include the default NaN values when parsing the data. |
| Depending on whether <cite>na_values</cite> is passed in, the behavior is as follows:</p> |
| <ul> |
| <li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are specified, <cite>na_values</cite> |
| is appended to the default NaN values used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are not specified, only |
| the default NaN values are used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are specified, only |
| the NaN values specified in <cite>na_values</cite> are used for parsing.</li> |
| <li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are not specified, no |
| strings will be parsed as NaN.</li> |
| </ul> |
| <p>Note that if <cite>na_filter</cite> is passed in as False, the <cite>keep_default_na</cite> and |
| <cite>na_values</cite> parameters will be ignored.</p> |
| </li> |
| <li><strong>na_filter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Detect missing value markers (empty strings and the value of na_values). In |
| data without any NAs, passing na_filter=False can improve the performance |
| of reading a large file.</li> |
| <li><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Indicate number of NA values placed in non-numeric columns.</li> |
| <li><strong>parse_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>list-like</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default False</em>) – <p>The behavior is as follows:</p> |
| <ul> |
| <li>bool. If True -> try parsing the index.</li> |
| <li>list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 |
| each as a separate date column.</li> |
| <li>list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as |
| a single date column.</li> |
| <li>dict, e.g. {‘foo’ : [1, 3]} -> parse columns 1, 3 as date and call |
| result ‘foo’</li> |
| </ul> |
| <p>If a column or index contains an unparsable date, the entire column or |
| index will be returned unaltered as an object data type. If you don’t want to |
| parse some cells as date just change their type in Excel to “Text”. |
| For non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after <code class="docutils literal notranslate"><span class="pre">pd.read_excel</span></code>.</p> |
| <p>Note: A fast-path exists for iso8601-formatted dates.</p> |
| </li> |
| <li><strong>date_parser</strong> (<em>function</em><em>, </em><em>optional</em>) – <p>Function to use for converting a sequence of string columns to an array of |
| datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the |
| conversion. Pandas will try to call <cite>date_parser</cite> in three different ways, |
| advancing to the next if an exception occurs: 1) Pass one or more arrays |
| (as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the |
| string values from the columns defined by <cite>parse_dates</cite> into a single array |
| and pass that; and 3) call <cite>date_parser</cite> once for each row using one or |
| more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as |
| arguments.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified">Deprecated since version 2.0.0: </span>Use <code class="docutils literal notranslate"><span class="pre">date_format</span></code> instead, or read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply |
| <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| </div> |
| </li> |
| <li><strong>date_format</strong> (str or dict of column -> format, default <code class="docutils literal notranslate"><span class="pre">None</span></code>) – <p>If used in conjunction with <code class="docutils literal notranslate"><span class="pre">parse_dates</span></code>, will parse dates according to this |
| format. For anything more complex, |
| please read in as <code class="docutils literal notranslate"><span class="pre">object</span></code> and then apply <code class="xref py py-func docutils literal notranslate"><span class="pre">to_datetime()</span></code> as-needed.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.0.</span></p> |
| </div> |
| </li> |
| <li><strong>thousands</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – Thousands separator for parsing string columns to numeric. Note that |
| this parameter is only necessary for columns stored as TEXT in Excel, |
| any numeric columns will automatically be parsed, regardless of display |
| format.</li> |
| <li><strong>decimal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default '.'</em>) – <p>Character to recognize as decimal point for parsing string columns to numeric. |
| Note that this parameter is only necessary for columns stored as TEXT in Excel, |
| any numeric columns will automatically be parsed, regardless of display |
| format (e.g. use ‘,’ for European data).</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.4.0.</span></p> |
| </div> |
| </li> |
| <li><strong>comment</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default None</em>) – Comments out remainder of line. Pass a character or characters to this |
| argument to indicate comments in the input file. Any data between the |
| comment string and the end of the current line is ignored.</li> |
| <li><strong>skipfooter</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 0</em>) – Rows at the end to skip (0-indexed).</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays, nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set, pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first">DeferredDataFrame from the passed in Excel file. See notes in sheet_name |
| argument for more information on when a dict of DeferredDataFrames is returned.</p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or dict of DeferredDataFrames</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_excel()</span></code></dt> |
| <dd>Write DeferredDataFrame to an Excel file.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt> |
| <dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_fwf" title="apache_beam.dataframe.io.read_fwf"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_fwf()</span></code></a></dt> |
| <dd>Read a table of fixed-width formatted lines into DeferredDataFrame.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">The file can be read using the file name as string or an open file object:</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1</span> |
| <span class="go">1 string2 2</span> |
| <span class="go">2 #Comment 3</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="s1">'rb'</span><span class="p">),</span> |
| <span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet3'</span><span class="p">)</span> |
| <span class="go"> Unnamed: 0 Name Value</span> |
| <span class="go">0 0 string1 1</span> |
| <span class="go">1 1 string2 2</span> |
| <span class="go">2 2 #Comment 3</span> |
| |
| <span class="go">Index and header can be specified via the `index_col` and `header` arguments</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">header</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span> |
| <span class="go"> 0 1 2</span> |
| <span class="go">0 NaN Name Value</span> |
| <span class="go">1 0.0 string1 1</span> |
| <span class="go">2 1.0 string2 2</span> |
| <span class="go">3 2.0 #Comment 3</span> |
| |
| <span class="go">Column types are inferred but can be explicitly specified</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">dtype</span><span class="o">=</span><span class="p">{</span><span class="s1">'Name'</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="s1">'Value'</span><span class="p">:</span> <span class="nb">float</span><span class="p">})</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1.0</span> |
| <span class="go">1 string2 2.0</span> |
| <span class="go">2 #Comment 3.0</span> |
| |
| <span class="go">True, False, and NA values, and thousands separators have defaults,</span> |
| <span class="go">but can be explicitly specified, too. Supply the values you would like</span> |
| <span class="go">as strings or lists of strings!</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">na_values</span><span class="o">=</span><span class="p">[</span><span class="s1">'string1'</span><span class="p">,</span> <span class="s1">'string2'</span><span class="p">])</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 NaN 1</span> |
| <span class="go">1 NaN 2</span> |
| <span class="go">2 #Comment 3</span> |
| |
| <span class="go">Comment lines in the excel input file can be skipped using the `comment` kwarg</span> |
| |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_excel</span><span class="p">(</span><span class="s1">'tmp.xlsx'</span><span class="p">,</span> <span class="n">index_col</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">comment</span><span class="o">=</span><span class="s1">'#'</span><span class="p">)</span> |
| <span class="go"> Name Value</span> |
| <span class="go">0 string1 1.0</span> |
| <span class="go">1 string2 2.0</span> |
| <span class="go">2 None NaN</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_feather"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_feather</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_feather" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Load a feather-format object from the file path.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.feather</span></code>.</li> |
| <li><strong>columns</strong> (<em>sequence</em><em>, </em><em>default None</em>) – If not provided, all columns are read.</li> |
| <li><strong>use_threads</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Whether to parallelize reading using multiple threads.</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays, nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set, pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">type of object stored in file</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_parquet"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_parquet</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_parquet" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Load a parquet object from the file path, returning a DeferredDataFrame.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. |
| The string could be a URL. Valid URL schemes include http, ftp, s3, |
| gs, and file. For file URLs, a host is expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.parquet</span></code>. |
| A file URL can also be a path to a directory that contains multiple |
| partitioned parquet files. Both pyarrow and fastparquet support |
| paths to directories as well as file URLs. A directory path could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/tables</span></code> or <code class="docutils literal notranslate"><span class="pre">s3://bucket/partition_dir</span></code>.</li> |
| <li><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – Parquet library to use. If ‘auto’, then the option |
| <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> |
| behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if |
| ‘pyarrow’ is unavailable.</li> |
| <li><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>default=None</em>) – If not None, only these columns will be read from the file.</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.3.0.</span></p> |
| </div> |
| </li> |
| <li><strong>use_nullable_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – <p>If True, use dtypes that use <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> as missing value indicator |
| for the resulting DeferredDataFrame. (only applicable for the <code class="docutils literal notranslate"><span class="pre">pyarrow</span></code> |
| engine) |
| As new dtypes are added that support <code class="docutils literal notranslate"><span class="pre">pd.NA</span></code> in the future, the |
| output with this option will change to use those dtypes. |
| Note: this is an experimental option, and behaviour (e.g. additional |
| support dtypes) may change without notice.</p> |
| <div class="deprecated"> |
| <p><span class="versionmodified">Deprecated since version 2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays, nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set, pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>**kwargs</strong> – Any additional kwargs are passed to the engine.</li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_sas"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_sas</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_sas" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read SAS files stored as either XPORT or SAS7BDAT format files.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">read()</span></code> function. The string could be a URL. |
| Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: |
| <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.sas7bdat</span></code>.</li> |
| <li><strong>format</strong> (<em>str {'xport'</em><em>, </em><em>'sas7bdat'}</em><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a>) – If None, file format is inferred from file extension. If ‘xport’ or |
| ‘sas7bdat’, uses the corresponding format.</li> |
| <li><strong>index</strong> (<em>identifier of index column</em><em>, </em><em>defaults to None</em>) – Identifier of column that should be used as index of the DeferredDataFrame.</li> |
| <li><strong>encoding</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default is None</em>) – Encoding for text data. If None, text data are stored as raw bytes.</li> |
| <li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a>) – <p>Read file <cite>chunksize</cite> lines at a time, returns iterator.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p> |
| </div> |
| </li> |
| <li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>defaults to False</em>) – <p>If True, returns an iterator for reading the file incrementally.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p> |
| </div> |
| </li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last"><em>DeferredDataFrame if iterator=False and chunksize=None, else SAS7BDATReader or XportReader</em></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_spss"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_spss</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_spss" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Load an SPSS file from the file path, returning a DataFrame.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><em>Path</em>) – File path.</li> |
| <li><strong>usecols</strong> (<em>list-like</em><em>, </em><em>optional</em>) – Return a subset of the columns. If None, return all columns.</li> |
| <li><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default is True</em>) – Convert categorical columns into pd.Categorical.</li> |
| <li><strong>dtype_backend</strong> (<em>{"numpy_nullable"</em><em>, </em><em>"pyarrow"}</em><em>, </em><em>defaults to NumPy backed DeferredDataFrames</em>) – <p>Which dtype_backend to use, e.g. whether a DeferredDataFrame should have NumPy |
| arrays. Nullable dtypes are used for all dtypes that have a nullable |
| implementation when “numpy_nullable” is set; pyarrow is used for all |
| dtypes if “pyarrow” is set.</p> |
| <p>The dtype_backends are still experimental.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a></p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.read_stata"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">read_stata</code><span class="sig-paren">(</span><em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.read_stata" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Read Stata file into DataFrame.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>filepath_or_buffer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em> or </em><em>file-like object</em>) – <p>Any valid string path is acceptable. The string could be a URL. Valid |
| URL schemes include http, ftp, s3, and file. For file URLs, a host is |
| expected. A local file could be: <code class="docutils literal notranslate"><span class="pre">file://localhost/path/to/table.dta</span></code>.</p> |
| <p>If you want to pass in a path object, pandas accepts any <code class="docutils literal notranslate"><span class="pre">os.PathLike</span></code>.</p> |
| <p>By file-like object, we refer to objects with a <code class="docutils literal notranslate"><span class="pre">read()</span></code> method, |
| such as a file handle (e.g. via builtin <code class="docutils literal notranslate"><span class="pre">open</span></code> function) |
| or <code class="docutils literal notranslate"><span class="pre">StringIO</span></code>.</p> |
| </li> |
| <li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Convert date variables to DeferredDataFrame time values.</li> |
| <li><strong>convert_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Read value labels and convert columns to Categorical/Factor variables.</li> |
| <li><strong>index_col</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Column to set as index.</li> |
| <li><strong>convert_missing</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Flag indicating whether to convert missing values to their Stata |
| representations. If False, missing values are replaced with nan. |
| If True, columns containing missing values are returned with |
| object data types and missing values are represented by |
| StataMissingValue objects.</li> |
| <li><strong>preserve_dtypes</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Preserve Stata datatypes. If False, numeric data are upcast to pandas |
| default types for foreign data (float64 or int64).</li> |
| <li><strong>columns</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a>) – Columns to retain. Columns will be returned in the given order. None |
| returns all columns.</li> |
| <li><strong>order_categoricals</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Flag indicating whether converted categorical data are ordered.</li> |
| <li><strong>chunksize</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default None</em>) – Return StataReader object for iterations, returns chunks with |
| given number of lines.</li> |
| <li><strong>iterator</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default False</em>) – Return StataReader object.</li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in. |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for Zstandard decompression using a |
| custom compression dictionary: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| </li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details. For more examples of storage options, see the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas guide to reading and writing remote files</a>.</p> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last"><a class="reference internal" href="apache_beam.dataframe.frames.html#apache_beam.dataframe.frames.DeferredDataFrame" title="apache_beam.dataframe.frames.DeferredDataFrame">DeferredDataFrame</a> or StataReader</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataReader()</span></code></dt> |
| <dd>Low-level reader for Stata data files.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_stata()</span></code></dt> |
| <dd>Export Stata data files.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>Categorical variables read through an iterator may not have the same |
| categories and dtype. This occurs when a variable stored in a DTA |
| file is associated with an incomplete set of value labels that only |
| label a strict subset of the values.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Creating a dummy stata for this example</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'animal'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">,</span> <span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'speed'</span><span class="p">:</span> <span class="p">[</span><span class="mi">350</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">361</span><span class="p">,</span> <span class="mi">15</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| |
| <span class="go">Read a Stata dta file:</span> |
| |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| |
| <span class="go">Read a Stata dta file in 10,000 line chunks:</span> |
| |
| <span class="gp">>>> </span><span class="n">values</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randint</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="mi">20_000</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="s2">"uint8"</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">"i"</span><span class="p">])</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'filename.dta'</span><span class="p">)</span> |
| |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_stata</span><span class="p">(</span><span class="s1">'filename.dta'</span><span class="p">,</span> <span class="n">chunksize</span><span class="o">=</span><span class="mi">10000</span><span class="p">)</span> <span class="k">as</span> <span class="n">itr</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="k">for</span> <span class="n">chunk</span> <span class="ow">in</span> <span class="n">itr</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="c1"># Operate on a single chunk, e.g., chunk.mean()</span> |
| <span class="gp">... </span> <span class="k">pass</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_excel"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_excel</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_excel" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Write object to an Excel sheet.</p> |
| <p>To write a single object to an Excel .xlsx file it is only necessary to |
| specify a target file name. To write to multiple sheets it is necessary to |
| create an <cite>ExcelWriter</cite> object with a target file name, and specify a sheet |
| in the file to write to.</p> |
| <p>Multiple sheets may be written to by specifying unique <cite>sheet_name</cite>. |
| With all data written to the file it is necessary to save the changes. |
| Note that creating an <cite>ExcelWriter</cite> object with a file name that already |
| exists will result in the contents of the existing file being erased.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple"> |
| <li><strong>excel_writer</strong> (<em>path-like</em><em>, </em><em>file-like</em><em>, or </em><em>ExcelWriter object</em>) – File path or existing ExcelWriter.</li> |
| <li><strong>sheet_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'Sheet1'</em>) – Name of sheet which will contain DeferredDataFrame.</li> |
| <li><strong>na_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default ''</em>) – Missing data representation.</li> |
| <li><strong>float_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Format string for floating point numbers. For example |
| <code class="docutils literal notranslate"><span class="pre">float_format="%.2f"</span></code> will format 0.1234 to 0.12.</li> |
| <li><strong>columns</strong> (<em>sequence</em><em> or </em><em>list of str</em><em>, </em><em>optional</em>) – Columns to write.</li> |
| <li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em> or </em><em>list of str</em><em>, </em><em>default True</em>) – Write out the column names. If a list of strings is given it is |
| assumed to be aliases for the column names.</li> |
| <li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Write row names (index).</li> |
| <li><strong>index_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><em>sequence</em><em>, </em><em>optional</em>) – Column label for index column(s) if desired. If not specified, and |
| <cite>header</cite> and <cite>index</cite> are True, then the index names are used. A |
| sequence should be given if the DeferredDataFrame uses MultiIndex.</li> |
| <li><strong>startrow</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell row to dump data frame.</li> |
| <li><strong>startcol</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.12)"><em>int</em></a><em>, </em><em>default 0</em>) – Upper left cell column to dump data frame.</li> |
| <li><strong>engine</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – Write engine to use, ‘openpyxl’ or ‘xlsxwriter’. You can also set this |
| via the options <code class="docutils literal notranslate"><span class="pre">io.excel.xlsx.writer</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">io.excel.xlsm.writer</span></code>.</li> |
| <li><strong>merge_cells</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default True</em>) – Write MultiIndex and Hierarchical Rows as merged cells.</li> |
| <li><strong>inf_rep</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>default 'inf'</em>) – Representation for infinity (there is no native representation for |
| infinity in Excel).</li> |
| <li><strong>freeze_panes</strong> (<em>tuple of int</em><em> (</em><em>length 2</em><em>)</em><em>, </em><em>optional</em>) – Specifies the one-based bottommost row and rightmost column that |
| is to be frozen.</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details. For more examples of storage options, see the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas guide to reading and writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.to_csv" title="apache_beam.dataframe.io.to_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">to_csv()</span></code></a></dt> |
| <dd>Write DeferredDataFrame to a comma-separated values (csv) file.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">ExcelWriter()</span></code></dt> |
| <dd>Class for writing DeferredDataFrame objects into excel sheets.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_excel" title="apache_beam.dataframe.io.read_excel"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_excel()</span></code></a></dt> |
| <dd>Read an Excel file into a DeferredDataFrame.</dd> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_csv" title="apache_beam.dataframe.io.read_csv"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_csv()</span></code></a></dt> |
| <dd>Read a comma-separated values (csv) file into DeferredDataFrame.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.formats.style.Styler.to_excel()</span></code></dt> |
| <dd>Add styles to Excel sheet.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>For compatibility with <code class="xref py py-meth docutils literal notranslate"><span class="pre">to_csv()</span></code>, |
| to_excel serializes lists and dicts to strings before writing.</p> |
| <p>Once a workbook has been saved it is not possible to write further |
| data without rewriting the whole workbook.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="go">Create, write to and save a workbook:</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([[</span><span class="s1">'a'</span><span class="p">,</span> <span class="s1">'b'</span><span class="p">],</span> <span class="p">[</span><span class="s1">'c'</span><span class="p">,</span> <span class="s1">'d'</span><span class="p">]],</span> |
| <span class="gp">... </span> <span class="n">index</span><span class="o">=</span><span class="p">[</span><span class="s1">'row 1'</span><span class="p">,</span> <span class="s1">'row 2'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">'col 1'</span><span class="p">,</span> <span class="s1">'col 2'</span><span class="p">])</span> |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">"output.xlsx"</span><span class="p">)</span> |
| |
| <span class="go">To specify the sheet name:</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s2">"output.xlsx"</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_1'</span><span class="p">)</span> |
| |
| <span class="go">If you wish to write to more than one sheet in the workbook, it is</span> |
| <span class="go">necessary to specify an ExcelWriter object:</span> |
| |
| <span class="gp">>>> </span><span class="n">df2</span> <span class="o">=</span> <span class="n">df1</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">'output.xlsx'</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_1'</span><span class="p">)</span> |
| <span class="gp">... </span> <span class="n">df2</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_2'</span><span class="p">)</span> |
| |
| <span class="go">ExcelWriter can also be used to append to an existing Excel file:</span> |
| |
| <span class="gp">>>> </span><span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">(</span><span class="s1">'output.xlsx'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">mode</span><span class="o">=</span><span class="s1">'a'</span><span class="p">)</span> <span class="k">as</span> <span class="n">writer</span><span class="p">:</span> |
| <span class="gp">... </span> <span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="n">writer</span><span class="p">,</span> <span class="n">sheet_name</span><span class="o">=</span><span class="s1">'Sheet_name_3'</span><span class="p">)</span> |
| |
| <span class="go">To set the library that is used to write the Excel file,</span> |
| <span class="go">you can pass the `engine` keyword (the default engine is</span> |
| <span class="go">automatically chosen depending on the file extension):</span> |
| |
| <span class="gp">>>> </span><span class="n">df1</span><span class="o">.</span><span class="n">to_excel</span><span class="p">(</span><span class="s1">'output1.xlsx'</span><span class="p">,</span> <span class="n">engine</span><span class="o">=</span><span class="s1">'xlsxwriter'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_feather"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_feather</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_feather" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Write a DataFrame to the binary Feather format.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function. If a string or a path, |
| it will be used as Root Directory path when writing a partitioned dataset.</li> |
| <li><strong>**kwargs</strong> – <p>Additional keywords passed to <code class="xref py py-func docutils literal notranslate"><span class="pre">pyarrow.feather.write_feather()</span></code>. |
| Starting with pyarrow 0.17, this includes the <cite>compression</cite>, |
| <cite>compression_level</cite>, <cite>chunksize</cite> and <cite>version</cite> keywords.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.1.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <p class="rubric">Notes</p> |
| <p>This function writes the dataframe as a <a class="reference external" href="https://arrow.apache.org/docs/python/feather.html">feather file</a>. It requires a default |
| index. To save a DeferredDataFrame with a custom index, use a method that |
| supports custom indices, e.g. <cite>to_parquet</cite>.</p> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_parquet"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_parquet</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_parquet" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Write a DataFrame to the binary parquet format.</p> |
| <p>This function writes the dataframe as a <a class="reference external" href="https://parquet.apache.org/">parquet file</a>. You can choose different parquet |
| backends, and have the option of compression. See |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v3.0.0.dev0+802.g7c836ed2ec)"><span class="xref std std-ref">the user guide</span></a> for more details.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, </em><em>file-like object</em><em>, or </em><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.12)"><em>None</em></a><em>, </em><em>default None</em>) – <p>String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function. If None, the result is |
| returned as bytes. If a string or path, it will be used as Root Directory |
| path when writing a partitioned dataset.</p> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.2.0.</span></p> |
| </div> |
| <p>Previously this was “fname”</p> |
| </li> |
| <li><strong>engine</strong> (<em>{'auto'</em><em>, </em><em>'pyarrow'</em><em>, </em><em>'fastparquet'}</em><em>, </em><em>default 'auto'</em>) – Parquet library to use. If ‘auto’, then the option |
| <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> is used. The default <code class="docutils literal notranslate"><span class="pre">io.parquet.engine</span></code> |
| behavior is to try ‘pyarrow’, falling back to ‘fastparquet’ if |
| ‘pyarrow’ is unavailable.</li> |
| <li><strong>compression</strong> (<em>{'snappy'</em><em>, </em><em>'gzip'</em><em>, </em><em>'brotli'</em><em>, </em><em>None}</em><em>, </em><em>default 'snappy'</em>) – Name of the compression to use. Use <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression.</li> |
| <li><strong>index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a><em>, </em><em>default None</em>) – If <code class="docutils literal notranslate"><span class="pre">True</span></code>, include the dataframe’s index(es) in the file output. |
| If <code class="docutils literal notranslate"><span class="pre">False</span></code>, they will not be written to the file. |
| If <code class="docutils literal notranslate"><span class="pre">None</span></code>, similar to <code class="docutils literal notranslate"><span class="pre">True</span></code> the dataframe’s index(es) |
| will be saved. However, instead of being saved as values, |
| the RangeIndex will be stored as a range in the metadata so it |
| doesn’t require much space and is faster. Other indexes will |
| be included as columns in the file output.</li> |
| <li><strong>partition_cols</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>optional</em><em>, </em><em>default None</em>) – Column names by which to partition the dataset. |
| Columns are partitioned in the order they are given. |
| Must be None if path is not a string.</li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options see the pandas guide to <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">reading and writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>**kwargs</strong> – Additional arguments passed to the parquet library. See |
| <a class="reference external" href="http://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-parquet" title="(in pandas v3.0.0.dev0+802.g7c836ed2ec)"><span class="xref std std-ref">pandas io</span></a> for more details.</li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first"></p> |
| </td> |
| </tr> |
| <tr class="field-odd field"><th class="field-name">Return type:</th><td class="field-body"><p class="first last">bytes if no path argument is provided else None</p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_parquet" title="apache_beam.dataframe.io.read_parquet"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_parquet()</span></code></a></dt> |
| <dd>Read a parquet file.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_orc()</span></code></dt> |
| <dd>Write an orc file.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_csv()</span></code></dt> |
| <dd>Write a csv file.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_sql()</span></code></dt> |
| <dd>Write to a sql table.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">DeferredDataFrame.to_hdf()</span></code></dt> |
| <dd>Write to hdf.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Notes</p> |
| <p>This function requires either the <a class="reference external" href="https://pypi.org/project/fastparquet">fastparquet</a> or <a class="reference external" href="https://arrow.apache.org/docs/python/">pyarrow</a> library.</p> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="o">=</span><span class="p">{</span><span class="s1">'col1'</span><span class="p">:</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="s1">'col2'</span><span class="p">:</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="s1">'df.parquet.gzip'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="n">compression</span><span class="o">=</span><span class="s1">'gzip'</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">pd</span><span class="o">.</span><span class="n">read_parquet</span><span class="p">(</span><span class="s1">'df.parquet.gzip'</span><span class="p">)</span> |
| <span class="go"> col1 col2</span> |
| <span class="go">0 1 3</span> |
| <span class="go">1 2 4</span> |
| |
| <span class="go">If you want to get a buffer to the parquet content you can use a io.BytesIO</span> |
| <span class="go">object, as long as you don't use partition_cols, which creates multiple files.</span> |
| |
| <span class="gp">>>> </span><span class="kn">import</span> <span class="nn">io</span> |
| <span class="gp">>>> </span><span class="n">f</span> <span class="o">=</span> <span class="n">io</span><span class="o">.</span><span class="n">BytesIO</span><span class="p">()</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_parquet</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> |
| <span class="gp">>>> </span><span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| <span class="go">0</span> |
| <span class="gp">>>> </span><span class="n">content</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">()</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| <dl class="function"> |
| <dt id="apache_beam.dataframe.io.to_stata"> |
| <code class="descclassname">apache_beam.dataframe.io.</code><code class="descname">to_stata</code><span class="sig-paren">(</span><em>df</em>, <em>path</em>, <em>*args</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="headerlink" href="#apache_beam.dataframe.io.to_stata" title="Permalink to this definition">¶</a></dt> |
| <dd><p>Export DataFrame object to Stata dta format.</p> |
| <p>Writes the DataFrame to a Stata dataset file. |
| “dta” files contain a Stata dataset.</p> |
| <table class="docutils field-list" frame="void" rules="none"> |
| <col class="field-name" /> |
| <col class="field-body" /> |
| <tbody valign="top"> |
| <tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple"> |
| <li><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>path object</em><em>, or </em><em>buffer</em>) – String, path object (implementing <code class="docutils literal notranslate"><span class="pre">os.PathLike[str]</span></code>), or file-like |
| object implementing a binary <code class="docutils literal notranslate"><span class="pre">write()</span></code> function.</li> |
| <li><strong>convert_dates</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a>) – Dictionary mapping columns containing datetime types to stata |
| internal format to use when writing the dates. Options are ‘tc’, |
| ‘td’, ‘tm’, ‘tw’, ‘th’, ‘tq’, ‘ty’. Column can be either an integer |
| or a name. Datetime columns that do not have a conversion type |
| specified will be converted to ‘tc’. Raises NotImplementedError if |
| a datetime column has timezone information.</li> |
| <li><strong>write_index</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.12)"><em>bool</em></a>) – Write the index to Stata dataset.</li> |
| <li><strong>byteorder</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a>) – Can be “&gt;”, “&lt;”, “little”, or “big”. The default is <cite>sys.byteorder</cite>.</li> |
| <li><strong>time_stamp</strong> (<em>datetime</em>) – A datetime to use as file creation date. Default is the current |
| time.</li> |
| <li><strong>data_label</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em>, </em><em>optional</em>) – A label for the data set. Must be 80 characters or smaller.</li> |
| <li><strong>variable_labels</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a>) – Dictionary containing columns as keys and variable labels as |
| values. Each label must be 80 characters or smaller.</li> |
| <li><strong>version</strong> (<em>{114</em><em>, </em><em>117</em><em>, </em><em>118</em><em>, </em><em>119</em><em>, </em><em>None}</em><em>, </em><em>default 114</em>) – <p>Version to use in the output dta file. Set to None to let pandas |
| decide between 118 or 119 formats depending on the number of |
| columns in the frame. pandas Version 114 can be read by Stata 10 and |
| later. pandas Version 117 can be read by Stata 13 or later. pandas Version 118 |
| is supported in Stata 14 and later. pandas Version 119 is supported in |
| Stata 15 and later. pandas Version 114 limits string variables to 244 |
| characters or fewer while versions 117 and later allow strings |
| with lengths up to 2,000,000 characters. Versions 118 and 119 |
| support Unicode characters, and pandas version 119 supports more than |
| 32,767 variables.</p> |
| <p>pandas Version 119 should usually only be used when the number of |
| variables exceeds the capacity of dta format 118. Exporting |
| smaller datasets in format 119 may have unintended consequences, |
| and, as of November 2020, Stata SE cannot read pandas version 119 files.</p> |
| </li> |
| <li><strong>convert_strl</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.12)"><em>list</em></a><em>, </em><em>optional</em>) – List of names of string columns to convert to Stata StrL |
| format. Only available if version is 117. Storing strings in the |
| StrL format can produce smaller dta files if strings have more than |
| 8 characters and values are repeated.</li> |
| <li><strong>compression</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.12)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>default 'infer'</em>) – <p>For on-the-fly compression of the output data. If ‘infer’ and ‘path’ is |
| path-like, then detect compression from the following extensions: ‘.gz’, |
| ‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’ |
| (otherwise no compression). |
| Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression. |
| Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set |
| to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other |
| key-value pairs are forwarded to |
| <code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>, |
| <code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code> or |
| <code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively. |
| As an example, the following could be passed for faster compression and to create |
| a reproducible gzip archive: |
| <code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p> |
| </div> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.1.0.</span></p> |
| </div> |
| <div class="versionchanged"> |
| <p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p> |
| </div> |
| </li> |
| <li><strong>storage_options</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.12)"><em>dict</em></a><em>, </em><em>optional</em>) – <p>Extra options that make sense for a particular storage connection, e.g. |
| host, port, username, password, etc. For HTTP(S) URLs the key-value pairs |
| are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other |
| URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are |
| forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more |
| details, and for more examples on storage options see the pandas guide to <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">reading and writing remote files</a>.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.2.0.</span></p> |
| </div> |
| </li> |
| <li><strong>value_labels</strong> (<em>dict of dicts</em>) – <p>Dictionary containing columns as keys and dictionaries of column value |
| to labels as values. Labels for a single variable must be 32,000 |
| characters or smaller.</p> |
| <div class="versionadded"> |
| <p><span class="versionmodified">New in version 1.4.0.</span></p> |
| </div> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| <tr class="field-even field"><th class="field-name">Raises:</th><td class="field-body"><ul class="first last"> |
| <li><p class="first"><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#NotImplementedError" title="(in Python v3.12)"><code class="xref py py-exc docutils literal notranslate"><span class="pre">NotImplementedError</span></code></a> –</p> |
| <ul class="simple"> |
| <li>If datetimes contain timezone information</li> |
| <li>Column dtype is not representable in Stata</li> |
| </ul> |
| </li> |
| <li><p class="first"><a class="reference external" href="https://docs.python.org/3/library/exceptions.html#ValueError" title="(in Python v3.12)"><code class="xref py py-exc docutils literal notranslate"><span class="pre">ValueError</span></code></a> –</p> |
| <ul class="simple"> |
| <li>Columns listed in convert_dates are neither datetime64[ns] nor datetime.datetime</li> |
| <li>Column listed in convert_dates is not in DeferredDataFrame</li> |
| <li>Categorical label contains more than 32,000 characters</li> |
| </ul> |
| </li> |
| </ul> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| <p class="rubric">Differences from pandas</p> |
| <p>This operation has no known divergences from the pandas API.</p> |
| <div class="admonition seealso"> |
| <p class="first admonition-title">See also</p> |
| <dl class="last docutils"> |
| <dt><a class="reference internal" href="#apache_beam.dataframe.io.read_stata" title="apache_beam.dataframe.io.read_stata"><code class="xref py py-func docutils literal notranslate"><span class="pre">read_stata()</span></code></a></dt> |
| <dd>Import Stata data files.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataWriter()</span></code></dt> |
| <dd>Low-level writer for Stata data files.</dd> |
| <dt><code class="xref py py-func docutils literal notranslate"><span class="pre">io.stata.StataWriter117()</span></code></dt> |
| <dd>Low-level writer for pandas version 117 files.</dd> |
| </dl> |
| </div> |
| <p class="rubric">Examples</p> |
| <p><strong>NOTE:</strong> These examples are pulled directly from the pandas documentation for convenience. Usage of the Beam DataFrame API will look different because it is a deferred API.</p> |
| <div class="highlight-pycon notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">'animal'</span><span class="p">:</span> <span class="p">[</span><span class="s1">'falcon'</span><span class="p">,</span> <span class="s1">'parrot'</span><span class="p">,</span> <span class="s1">'falcon'</span><span class="p">,</span> |
| <span class="gp">... </span> <span class="s1">'parrot'</span><span class="p">],</span> |
| <span class="gp">... </span> <span class="s1">'speed'</span><span class="p">:</span> <span class="p">[</span><span class="mi">350</span><span class="p">,</span> <span class="mi">18</span><span class="p">,</span> <span class="mi">361</span><span class="p">,</span> <span class="mi">15</span><span class="p">]})</span> |
| <span class="gp">>>> </span><span class="n">df</span><span class="o">.</span><span class="n">to_stata</span><span class="p">(</span><span class="s1">'animals.dta'</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| </dd></dl> |
| |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| </div> |
| <footer> |
| |
| <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> |
| |
| <a href="apache_beam.dataframe.pandas_top_level_functions.html" class="btn btn-neutral float-right" title="apache_beam.dataframe.pandas_top_level_functions module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a> |
| |
| |
| <a href="apache_beam.dataframe.frames.html" class="btn btn-neutral float-left" title="apache_beam.dataframe.frames module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a> |
| |
| </div> |
| |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p> |
| © Copyright |
| |
| </p> |
| </div> |
| Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| </footer> |
| |
| </div> |
| </div> |
| |
| </section> |
| |
| </div> |
| |
| |
| |
| <script type="text/javascript"> |
| // Runs once the DOM is ready: turns on the Read the Docs theme navigation. |
| // NOTE(review): the meaning of the `true` flag is defined in theme.js — confirm there. |
| jQuery(function () { |
| var themeNavigation = SphinxRtdTheme.Navigation; |
| themeNavigation.enable(true); |
| }); |
| </script> |
| |
| |
| |
| |
| |
| |
| </body> |
| </html> |