blob: 81ac9e558baacd976d4a28ce6bc5142a1a105d0e [file] [log] [blame]
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>apache_beam.io.textio module &mdash; Apache Beam 2.47.0 documentation</title>
<script type="text/javascript" src="_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="apache_beam.io.tfrecordio module" href="apache_beam.io.tfrecordio.html" />
<link rel="prev" title="apache_beam.io.source_test_utils module" href="apache_beam.io.source_test_utils.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home"> Apache Beam
</a>
<div class="version">
2.47.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="apache_beam.io.html#subpackages">Subpackages</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="apache_beam.io.html#submodules">Submodules</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.avroio.html">apache_beam.io.avroio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.concat_source.html">apache_beam.io.concat_source module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.debezium.html">apache_beam.io.debezium module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filebasedsink.html">apache_beam.io.filebasedsink module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filebasedsource.html">apache_beam.io.filebasedsource module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.fileio.html">apache_beam.io.fileio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystem.html">apache_beam.io.filesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystemio.html">apache_beam.io.filesystemio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystems.html">apache_beam.io.filesystems module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.hadoopfilesystem.html">apache_beam.io.hadoopfilesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.iobase.html">apache_beam.io.iobase module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.jdbc.html">apache_beam.io.jdbc module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.kafka.html">apache_beam.io.kafka module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.kinesis.html">apache_beam.io.kinesis module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.localfilesystem.html">apache_beam.io.localfilesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.mongodbio.html">apache_beam.io.mongodbio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.parquetio.html">apache_beam.io.parquetio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.range_trackers.html">apache_beam.io.range_trackers module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.restriction_trackers.html">apache_beam.io.restriction_trackers module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.snowflake.html">apache_beam.io.snowflake module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.source_test_utils.html">apache_beam.io.source_test_utils module</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">apache_beam.io.textio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.tfrecordio.html">apache_beam.io.tfrecordio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.utils.html">apache_beam.io.utils module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.watermark_estimators.html">apache_beam.io.watermark_estimators module</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.testing.html">apache_beam.testing package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.yaml.html">apache_beam.yaml package</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Apache Beam</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> &raquo;</li>
<li><a href="apache_beam.io.html">apache_beam.io package</a> &raquo;</li>
<li>apache_beam.io.textio module</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/apache_beam.io.textio.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="module-apache_beam.io.textio">
<span id="apache-beam-io-textio-module"></span><h1>apache_beam.io.textio module<a class="headerlink" href="#module-apache_beam.io.textio" title="Permalink to this headline"></a></h1>
<p>A source and a sink for reading from and writing to text files.</p>
<dl class="class">
<dt id="apache_beam.io.textio.ReadAllFromText">
<em class="property">class </em><code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadAllFromText</code><span class="sig-paren">(</span><em>min_bundle_size=0</em>, <em>desired_bundle_size=67108864</em>, <em>compression_type='auto'</em>, <em>strip_trailing_newlines=True</em>, <em>validate=False</em>, <em>coder=StrUtf8Coder</em>, <em>skip_header_lines=0</em>, <em>with_filename=False</em>, <em>delimiter=None</em>, <em>escapechar=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadAllFromText"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadAllFromText" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> for reading a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of text files.</p>
<blockquote>
<div>Reads a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of text files or file patterns and produces a
<code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of strings.</div></blockquote>
<p>Parses a text file as newline-delimited elements, by default assuming
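UTF-8 encoding. Supports newline delimiters <code class="docutils literal notranslate"><span class="pre">\n</span></code> and <code class="docutils literal notranslate"><span class="pre">\r\n</span></code>.</p>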
<p>If <cite>with_filename</cite> is <code class="docutils literal notranslate"><span class="pre">True</span></code> the output will include the file name. This is
similar to <code class="docutils literal notranslate"><span class="pre">ReadFromTextWithFilename</span></code> but this <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> can be placed
anywhere in the pipeline.</p>
<p>This implementation only supports reading text encoded using UTF-8 or ASCII.
This does not support other encodings such as UTF-16 or UTF-32.</p>
<p>This implementation is only tested with batch pipelines. In streaming,
reading may be delayed due to limitations of the Reshuffle involved.</p>
<p>Initialize the <code class="docutils literal notranslate"><span class="pre">ReadAllFromText</span></code> transform.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>min_bundle_size</strong> – Minimum size of bundles that should be generated when
splitting this source into bundles. See <code class="docutils literal notranslate"><span class="pre">FileBasedSource</span></code> for more
details.</li>
<li><strong>desired_bundle_size</strong> – Desired size of bundles that should be generated when
splitting this source into bundles. See <code class="docutils literal notranslate"><span class="pre">FileBasedSource</span></code> for more
details.</li>
<li><strong>compression_type</strong> – Used to handle compressed input files. Typical value
is <code class="docutils literal notranslate"><span class="pre">CompressionTypes.AUTO</span></code>, in which case the underlying file_path’s
extension will be used to detect the compression.</li>
<li><strong>strip_trailing_newlines</strong> – Indicates whether this source should remove
the newline char in each line it reads before decoding that line.</li>
<li><strong>validate</strong> – flag to verify that the files exist during the pipeline
creation time.</li>
<li><strong>skip_header_lines</strong> – Number of header lines to skip. Same number is skipped
from each source file. Must be 0 or higher. Large number of skipped
lines might impact performance.</li>
<li><strong>coder</strong> – Coder used to decode each line.</li>
<li><strong>with_filename</strong> – If True, returns a Key Value with the key being the file
name and the value being the actual data. If False, it only returns
the data.</li>
<li><strong>delimiter</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – delimiter to split records.
Must not self-overlap, because self-overlapping delimiters cause
ambiguous parsing.</li>
<li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – a single byte to escape the records
delimiter, can also escape itself.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="attribute">
<dt id="apache_beam.io.textio.ReadAllFromText.DEFAULT_DESIRED_BUNDLE_SIZE">
<code class="descname">DEFAULT_DESIRED_BUNDLE_SIZE</code><em class="property"> = 67108864</em><a class="headerlink" href="#apache_beam.io.textio.ReadAllFromText.DEFAULT_DESIRED_BUNDLE_SIZE" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="apache_beam.io.textio.ReadAllFromText.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadAllFromText.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadAllFromText.expand" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="apache_beam.io.textio.ReadAllFromTextContinuously">
<em class="property">class </em><code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadAllFromTextContinuously</code><span class="sig-paren">(</span><em>file_pattern</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadAllFromTextContinuously"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadAllFromTextContinuously" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#apache_beam.io.textio.ReadAllFromText" title="apache_beam.io.textio.ReadAllFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.io.textio.ReadAllFromText</span></code></a></p>
<p>A <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> for reading text files in given file patterns.
This PTransform acts as a Source and continuously produces a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code>
of strings.</p>
<p>For more details, see <code class="docutils literal notranslate"><span class="pre">ReadAllFromText</span></code> for text parsing settings;
see <code class="docutils literal notranslate"><span class="pre">apache_beam.io.fileio.MatchContinuously</span></code> for watching settings.</p>
<p>ReadAllFromTextContinuously is experimental. No backwards-compatibility
guarantees. Due to limitations of Reshuffle, the current implementation does
not scale.</p>
<p>Initialize the <code class="docutils literal notranslate"><span class="pre">ReadAllFromTextContinuously</span></code> transform.</p>
<p>Accepts args for constructor args of both <a class="reference internal" href="#apache_beam.io.textio.ReadAllFromText" title="apache_beam.io.textio.ReadAllFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadAllFromText</span></code></a> and
<a class="reference internal" href="apache_beam.io.fileio.html#apache_beam.io.fileio.MatchContinuously" title="apache_beam.io.fileio.MatchContinuously"><code class="xref py py-class docutils literal notranslate"><span class="pre">MatchContinuously</span></code></a>.</p>
<dl class="method">
<dt id="apache_beam.io.textio.ReadAllFromTextContinuously.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pbegin</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadAllFromTextContinuously.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadAllFromTextContinuously.expand" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="apache_beam.io.textio.ReadFromText">
<em class="property">class </em><code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadFromText</code><span class="sig-paren">(</span><em>file_pattern=None</em>, <em>min_bundle_size=0</em>, <em>compression_type='auto'</em>, <em>strip_trailing_newlines=True</em>, <em>coder=StrUtf8Coder</em>, <em>validate=True</em>, <em>skip_header_lines=0</em>, <em>delimiter=None</em>, <em>escapechar=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadFromText"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadFromText" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a> for reading text
files.</p>
<p>Parses a text file as newline-delimited elements, by default assuming
<code class="docutils literal notranslate"><span class="pre">UTF-8</span></code> encoding. Supports newline delimiters <code class="docutils literal notranslate"><span class="pre">\n</span></code> and <code class="docutils literal notranslate"><span class="pre">\r\n</span></code>
or a specified delimiter.</p>
<p>This implementation only supports reading text encoded using <code class="docutils literal notranslate"><span class="pre">UTF-8</span></code> or
<code class="docutils literal notranslate"><span class="pre">ASCII</span></code>.
This does not support other encodings such as <code class="docutils literal notranslate"><span class="pre">UTF-16</span></code> or <code class="docutils literal notranslate"><span class="pre">UTF-32</span></code>.</p>
<p>Initialize the <a class="reference internal" href="#apache_beam.io.textio.ReadFromText" title="apache_beam.io.textio.ReadFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromText</span></code></a> transform.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>file_pattern</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The file path to read from as a local file path or a
GCS <code class="docutils literal notranslate"><span class="pre">gs://</span></code> path. The path can contain glob characters
(<code class="docutils literal notranslate"><span class="pre">*</span></code>, <code class="docutils literal notranslate"><span class="pre">?</span></code>, and <code class="docutils literal notranslate"><span class="pre">[...]</span></code> sets).</li>
<li><strong>min_bundle_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – Minimum size of bundles that should be generated
when splitting this source into bundles. See
<a class="reference internal" href="apache_beam.io.filebasedsource.html#apache_beam.io.filebasedsource.FileBasedSource" title="apache_beam.io.filebasedsource.FileBasedSource"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileBasedSource</span></code></a> for more
details.</li>
<li><strong>compression_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – Used to handle compressed input files.
Typical value is <a class="reference internal" href="apache_beam.io.filesystem.html#apache_beam.io.filesystem.CompressionTypes.AUTO" title="apache_beam.io.filesystem.CompressionTypes.AUTO"><code class="xref py py-attr docutils literal notranslate"><span class="pre">CompressionTypes.AUTO</span></code></a>, in which case the
underlying file_path’s extension will be used to detect the compression.</li>
<li><strong>strip_trailing_newlines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – Indicates whether this source should
remove the newline char in each line it reads before decoding that line.</li>
<li><strong>validate</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – flag to verify that the files exist during the pipeline
creation time.</li>
<li><strong>skip_header_lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – Number of header lines to skip. Same number is
skipped from each source file. Must be 0 or higher. Large number of
skipped lines might impact performance.</li>
<li><strong>coder</strong> (<a class="reference internal" href="apache_beam.coders.coders.html#apache_beam.coders.coders.Coder" title="apache_beam.coders.coders.Coder"><em>Coder</em></a>) – Coder used to decode each line.</li>
<li><strong>delimiter</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – delimiter to split records.
Must not self-overlap, because self-overlapping delimiters cause
ambiguous parsing.</li>
<li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – a single byte to escape the records
delimiter, can also escape itself.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="apache_beam.io.textio.ReadFromText.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadFromText.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadFromText.expand" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="class">
<dt id="apache_beam.io.textio.ReadFromTextWithFilename">
<em class="property">class </em><code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadFromTextWithFilename</code><span class="sig-paren">(</span><em>file_pattern=None</em>, <em>min_bundle_size=0</em>, <em>compression_type='auto'</em>, <em>strip_trailing_newlines=True</em>, <em>coder=StrUtf8Coder</em>, <em>validate=True</em>, <em>skip_header_lines=0</em>, <em>delimiter=None</em>, <em>escapechar=None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadFromTextWithFilename"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadFromTextWithFilename" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="#apache_beam.io.textio.ReadFromText" title="apache_beam.io.textio.ReadFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.io.textio.ReadFromText</span></code></a></p>
<p>A <a class="reference internal" href="#apache_beam.io.textio.ReadFromText" title="apache_beam.io.textio.ReadFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromText</span></code></a> for reading text
files returning the name of the file and the content of the file.</p>
<p>This class extends the ReadFromText class, only setting a different
_source_class attribute.</p>
<p>Initialize the <a class="reference internal" href="#apache_beam.io.textio.ReadFromText" title="apache_beam.io.textio.ReadFromText"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromText</span></code></a> transform.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>file_pattern</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The file path to read from as a local file path or a
GCS <code class="docutils literal notranslate"><span class="pre">gs://</span></code> path. The path can contain glob characters
(<code class="docutils literal notranslate"><span class="pre">*</span></code>, <code class="docutils literal notranslate"><span class="pre">?</span></code>, and <code class="docutils literal notranslate"><span class="pre">[...]</span></code> sets).</li>
<li><strong>min_bundle_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – Minimum size of bundles that should be generated
when splitting this source into bundles. See
<a class="reference internal" href="apache_beam.io.filebasedsource.html#apache_beam.io.filebasedsource.FileBasedSource" title="apache_beam.io.filebasedsource.FileBasedSource"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileBasedSource</span></code></a> for more
details.</li>
<li><strong>compression_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – Used to handle compressed input files.
Typical value is <a class="reference internal" href="apache_beam.io.filesystem.html#apache_beam.io.filesystem.CompressionTypes.AUTO" title="apache_beam.io.filesystem.CompressionTypes.AUTO"><code class="xref py py-attr docutils literal notranslate"><span class="pre">CompressionTypes.AUTO</span></code></a>, in which case the
underlying file_path’s extension will be used to detect the compression.</li>
<li><strong>strip_trailing_newlines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – Indicates whether this source should
remove the newline char in each line it reads before decoding that line.</li>
<li><strong>validate</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – flag to verify that the files exist during the pipeline
creation time.</li>
<li><strong>skip_header_lines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – Number of header lines to skip. Same number is
skipped from each source file. Must be 0 or higher. Large number of
skipped lines might impact performance.</li>
<li><strong>coder</strong> (<a class="reference internal" href="apache_beam.coders.coders.html#apache_beam.coders.coders.Coder" title="apache_beam.coders.coders.Coder"><em>Coder</em></a>) – Coder used to decode each line.</li>
<li><strong>delimiter</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – delimiter to split records.
Must not self-overlap, because self-overlapping delimiters cause
ambiguous parsing.</li>
<li><strong>escapechar</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.11)"><em>bytes</em></a>) – a single byte to escape the records
delimiter, can also escape itself.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>
<dl class="class">
<dt id="apache_beam.io.textio.WriteToText">
<em class="property">class </em><code class="descclassname">apache_beam.io.textio.</code><code class="descname">WriteToText</code><span class="sig-paren">(</span><em>file_path_prefix</em>, <em>file_name_suffix=''</em>, <em>append_trailing_newlines=True</em>, <em>num_shards=0</em>, <em>shard_name_template=None</em>, <em>coder=ToBytesCoder</em>, <em>compression_type='auto'</em>, <em>header=None</em>, <em>footer=None</em>, <em>*</em>, <em>max_records_per_shard=None</em>, <em>max_bytes_per_shard=None</em>, <em>skip_if_empty=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#WriteToText"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.WriteToText" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a> for writing to
text files.</p>
<p>Initialize a <a class="reference internal" href="#apache_beam.io.textio.WriteToText" title="apache_beam.io.textio.WriteToText"><code class="xref py py-class docutils literal notranslate"><span class="pre">WriteToText</span></code></a> transform.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>file_path_prefix</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The file path to write to. The files written will
begin with this prefix, followed by a shard identifier (see
<strong>num_shards</strong>), and end in a common extension, if given by
<strong>file_name_suffix</strong>. In most cases, only this argument is specified and
<strong>num_shards</strong>, <strong>shard_name_template</strong>, and <strong>file_name_suffix</strong> use
default values.</li>
<li><strong>file_name_suffix</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – Suffix for the files written.</li>
<li><strong>append_trailing_newlines</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – Indicates whether this sink should write
an additional newline char after writing each element.</li>
<li><strong>num_shards</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – The number of files (shards) used for output.
If not set, the service will decide on the optimal number of shards.
Constraining the number of shards is likely to reduce
the performance of a pipeline. Setting this value is not recommended
unless you require a specific number of output files.</li>
<li><strong>shard_name_template</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – A template string containing placeholders for
the shard number and shard count. Currently only <code class="docutils literal notranslate"><span class="pre">''</span></code> and
<code class="docutils literal notranslate"><span class="pre">'-SSSSS-of-NNNNN'</span></code> are patterns accepted by the service.
When constructing a filename for a particular shard number, the
upper-case letters <code class="docutils literal notranslate"><span class="pre">S</span></code> and <code class="docutils literal notranslate"><span class="pre">N</span></code> are replaced with the <code class="docutils literal notranslate"><span class="pre">0</span></code>-padded
shard number and shard count respectively. This argument can be <code class="docutils literal notranslate"><span class="pre">''</span></code>
in which case it behaves as if num_shards was set to 1 and only one file
will be generated. The default pattern used is <code class="docutils literal notranslate"><span class="pre">'-SSSSS-of-NNNNN'</span></code>.</li>
<li><strong>coder</strong> (<a class="reference internal" href="apache_beam.coders.coders.html#apache_beam.coders.coders.Coder" title="apache_beam.coders.coders.Coder"><em>Coder</em></a>) – Coder used to encode each line.</li>
<li><strong>compression_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – Used to handle compressed output files.
Typical value is <a class="reference internal" href="apache_beam.io.filesystem.html#apache_beam.io.filesystem.CompressionTypes.AUTO" title="apache_beam.io.filesystem.CompressionTypes.AUTO"><code class="xref py py-class docutils literal notranslate"><span class="pre">CompressionTypes.AUTO</span></code></a>, in which case the
final file path’s extension (as determined by <strong>file_path_prefix</strong>,
<strong>file_name_suffix</strong>, <strong>num_shards</strong> and <strong>shard_name_template</strong>) will
be used to detect the compression.</li>
<li><strong>header</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – String to write at beginning of file as a header.
If not <a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">None</span></code></a> and <strong>append_trailing_newlines</strong> is set, <code class="docutils literal notranslate"><span class="pre">\n</span></code> will
be added.</li>
<li><strong>footer</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – String to write at the end of file as a footer.
If not <a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.11)"><code class="xref py py-data docutils literal notranslate"><span class="pre">None</span></code></a> and <strong>append_trailing_newlines</strong> is set, <code class="docutils literal notranslate"><span class="pre">\n</span></code> will
be added.</li>
<li><strong>max_records_per_shard</strong> – Maximum number of records to write to any
individual shard.</li>
<li><strong>max_bytes_per_shard</strong> – Target maximum number of bytes to write to any
individual shard. This may be exceeded slightly, as a new shard is
created once this limit is hit, but the remainder of a given record, a
subsequent newline, and a footer may cause the actual shard size
to exceed this value. This also tracks the uncompressed,
not compressed, size of the shard.</li>
<li><strong>skip_if_empty</strong> – Don’t write any shards if the PCollection is empty.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="apache_beam.io.textio.WriteToText.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pcoll</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#WriteToText.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.WriteToText.expand" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="function">
<dt id="apache_beam.io.textio.ReadFromCsv">
<code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadFromCsv</code><span class="sig-paren">(</span><em>path: str</em>, <em>*</em>, <em>splittable: bool = True</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadFromCsv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadFromCsv" title="Permalink to this definition"></a></dt>
<dd><p>A PTransform for reading comma-separated values (csv) files into a
PCollection.</p>
<dl class="docutils">
<dt>Args:</dt>
<dd><dl class="first last docutils">
<dt>path (str)</dt>
<dd>The file path to read from. The path can contain glob
characters such as <code class="docutils literal notranslate"><span class="pre">*</span></code> and <code class="docutils literal notranslate"><span class="pre">?</span></code>.</dd>
<dt>splittable (bool)</dt>
<dd>Whether the csv files are splittable at line
boundaries, i.e. each line of this file represents a complete record.
This should be set to False if single records span multiple lines (e.g.
a quoted field has a newline inside of it). Setting this to false may
disable liquid sharding.</dd>
<dt><code class="docutils literal notranslate"><span class="pre">**kwargs</span></code></dt>
<dd>Extra arguments passed to <cite>pandas.read_csv</cite> (see below).</dd>
</dl>
</dd>
</dl>
<dl class="docutils">
<dt>sep <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘,’</span></dt>
<dd>Delimiter to use. If sep is None, the C engine cannot automatically detect
the separator, but the Python parsing engine can, meaning the latter will
be used and automatically detect the separator by Python’s builtin sniffer
tool, <code class="docutils literal notranslate"><span class="pre">csv.Sniffer</span></code>. In addition, separators longer than 1 character and
different from <code class="docutils literal notranslate"><span class="pre">'\s+'</span></code> will be interpreted as regular expressions and
will also force the use of the Python parsing engine. Note that regex
delimiters are prone to ignoring quoted data. Regex example: <code class="docutils literal notranslate"><span class="pre">'\r\t'</span></code>.</dd>
<dt>delimiter <span class="classifier-delimiter">:</span> <span class="classifier">str, default <code class="docutils literal notranslate"><span class="pre">None</span></code></span></dt>
<dd>Alias for sep.</dd>
<dt>header <span class="classifier-delimiter">:</span> <span class="classifier">int, list of int, None, default ‘infer’</span></dt>
<dd>Row number(s) to use as the column names, and the start of the
data. Default behavior is to infer the column names: if no names
are passed the behavior is identical to <code class="docutils literal notranslate"><span class="pre">header=0</span></code> and column
names are inferred from the first line of the file, if column
names are passed explicitly then the behavior is identical to
<code class="docutils literal notranslate"><span class="pre">header=None</span></code>. Explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to be able to
replace existing names. The header can be a list of integers that
specify row locations for a multi-index on the columns
e.g. [0,1,3]. Intervening rows that are not specified will be
skipped (e.g. 2 in this example is skipped). Note that this
parameter ignores commented lines and empty lines if
<code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>, so <code class="docutils literal notranslate"><span class="pre">header=0</span></code> denotes the first line of
data rather than the first line of the file.</dd>
<dt>names <span class="classifier-delimiter">:</span> <span class="classifier">array-like, optional</span></dt>
<dd>List of column names to use. If the file contains a header row,
then you should explicitly pass <code class="docutils literal notranslate"><span class="pre">header=0</span></code> to override the column names.
Duplicates in this list are not allowed.</dd>
<dt>index_col <span class="classifier-delimiter">:</span> <span class="classifier">int, str, sequence of int / str, or False, optional, default <code class="docutils literal notranslate"><span class="pre">None</span></code></span></dt>
<dd><p class="first">Column(s) to use as the row labels of the <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code>, either given as
string name or column index. If a sequence of int / str is given, a
MultiIndex is used.</p>
<p class="last">Note: <code class="docutils literal notranslate"><span class="pre">index_col=False</span></code> can be used to force pandas to <em>not</em> use the first
column as the index, e.g. when you have a malformed file with delimiters at
the end of each line.</p>
</dd>
<dt>usecols <span class="classifier-delimiter">:</span> <span class="classifier">list-like or callable, optional</span></dt>
<dd><p class="first">Return a subset of the columns. If list-like, all elements must either
be positional (i.e. integer indices into the document columns) or strings
that correspond to column names provided either by the user in <cite>names</cite> or
inferred from the document header row(s). If <code class="docutils literal notranslate"><span class="pre">names</span></code> are given, the document
header row(s) are not taken into account. For example, a valid list-like
<cite>usecols</cite> parameter would be <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1,</span> <span class="pre">2]</span></code> or <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar',</span> <span class="pre">'baz']</span></code>.
Element order is ignored, so <code class="docutils literal notranslate"><span class="pre">usecols=[0,</span> <span class="pre">1]</span></code> is the same as <code class="docutils literal notranslate"><span class="pre">[1,</span> <span class="pre">0]</span></code>.
To instantiate a DataFrame from <code class="docutils literal notranslate"><span class="pre">data</span></code> with element order preserved use
<code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['foo',</span> <span class="pre">'bar']]</span></code> for columns
in <code class="docutils literal notranslate"><span class="pre">['foo',</span> <span class="pre">'bar']</span></code> order or
<code class="docutils literal notranslate"><span class="pre">pd.read_csv(data,</span> <span class="pre">usecols=['foo',</span> <span class="pre">'bar'])[['bar',</span> <span class="pre">'foo']]</span></code>
for <code class="docutils literal notranslate"><span class="pre">['bar',</span> <span class="pre">'foo']</span></code> order.</p>
<p class="last">If callable, the callable function will be evaluated against the column
names, returning names where the callable function evaluates to True. An
example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x.upper()</span> <span class="pre">in</span>
<span class="pre">['AAA',</span> <span class="pre">'BBB',</span> <span class="pre">'DDD']</span></code>. Using this parameter results in much faster
parsing time and lower memory usage.</p>
</dd>
<dt>squeeze <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd><p class="first">If the parsed data only contains one column then return a Series.</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.4.0: </span>Append <code class="docutils literal notranslate"><span class="pre">.squeeze(&quot;columns&quot;)</span></code> to the call to <code class="docutils literal notranslate"><span class="pre">read_csv</span></code> to squeeze
the data.</p>
</div>
</dd>
<dt>prefix <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd><p class="first">Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1, …</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.4.0: </span>Use a list comprehension on the DataFrame’s columns after calling <code class="docutils literal notranslate"><span class="pre">read_csv</span></code>.</p>
</div>
</dd>
<dt>mangle_dupe_cols <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd><p class="first">Duplicate columns will be specified as ‘X’, ‘X.1’, … ‘X.N’, rather than
‘X’ … ‘X’. Passing in False will cause data to be overwritten if there
are duplicate names in the columns.</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.5.0: </span>Not implemented, and a new argument to specify the pattern for the
names of duplicated columns will be added instead.</p>
</div>
</dd>
<dt>dtype <span class="classifier-delimiter">:</span> <span class="classifier">Type name or dict of column -&gt; type, optional</span></dt>
<dd><p class="first">Data type for data or columns. E.g. {‘a’: np.float64, ‘b’: np.int32,
‘c’: ‘Int64’}
Use <cite>str</cite> or <cite>object</cite> together with suitable <cite>na_values</cite> settings
to preserve and not interpret dtype.
If converters are specified, they will be applied INSTEAD
of dtype conversion.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.5.0: </span>Support for defaultdict was added. Specify a defaultdict as input where
the default determines the dtype of the columns which are not explicitly
listed.</p>
</div>
</dd>
<dt>engine <span class="classifier-delimiter">:</span> <span class="classifier">{‘c’, ‘python’, ‘pyarrow’}, optional</span></dt>
<dd><p class="first">Parser engine to use. The C and pyarrow engines are faster, while the python engine
is currently more feature-complete. Multithreading is currently only supported by
the pyarrow engine.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.4.0: </span>The “pyarrow” engine was added as an <em>experimental</em> engine, and some features
are unsupported, or may not work correctly, with this engine.</p>
</div>
</dd>
<dt>converters <span class="classifier-delimiter">:</span> <span class="classifier">dict, optional</span></dt>
<dd>Dict of functions for converting values in certain columns. Keys can either
be integers or column labels.</dd>
<dt>true_values <span class="classifier-delimiter">:</span> <span class="classifier">list, optional</span></dt>
<dd>Values to consider as True.</dd>
<dt>false_values <span class="classifier-delimiter">:</span> <span class="classifier">list, optional</span></dt>
<dd>Values to consider as False.</dd>
<dt>skipinitialspace <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>Skip spaces after delimiter.</dd>
<dt>skiprows <span class="classifier-delimiter">:</span> <span class="classifier">list-like, int or callable, optional</span></dt>
<dd><p class="first">Line numbers to skip (0-indexed) or number of lines to skip (int)
at the start of the file.</p>
<p class="last">If callable, the callable function will be evaluated against the row
indices, returning True if the row should be skipped and False otherwise.
An example of a valid callable argument would be <code class="docutils literal notranslate"><span class="pre">lambda</span> <span class="pre">x:</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">[0,</span> <span class="pre">2]</span></code>.</p>
</dd>
<dt>skipfooter <span class="classifier-delimiter">:</span> <span class="classifier">int, default 0</span></dt>
<dd>Number of lines at bottom of file to skip (Unsupported with <code class="docutils literal notranslate"><span class="pre">engine='c'</span></code>).</dd>
<dt>nrows <span class="classifier-delimiter">:</span> <span class="classifier">int, optional</span></dt>
<dd>Number of rows of file to read. Useful for reading pieces of large files.</dd>
<dt>na_values <span class="classifier-delimiter">:</span> <span class="classifier">scalar, str, list-like, or dict, optional</span></dt>
<dd>Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted as
NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’,
‘1.#IND’, ‘1.#QNAN’, ‘&lt;NA&gt;’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’,
‘nan’, ‘null’.</dd>
<dt>keep_default_na <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd><p class="first">Whether or not to include the default NaN values when parsing the data.
Depending on whether <cite>na_values</cite> is passed in, the behavior is as follows:</p>
<ul class="simple">
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are specified, <cite>na_values</cite>
is appended to the default NaN values used for parsing.</li>
<li>If <cite>keep_default_na</cite> is True, and <cite>na_values</cite> are not specified, only
the default NaN values are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are specified, only
the NaN values specified in <cite>na_values</cite> are used for parsing.</li>
<li>If <cite>keep_default_na</cite> is False, and <cite>na_values</cite> are not specified, no
strings will be parsed as NaN.</li>
</ul>
<p class="last">Note that if <cite>na_filter</cite> is passed in as False, the <cite>keep_default_na</cite> and
<cite>na_values</cite> parameters will be ignored.</p>
</dd>
<dt>na_filter <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd>Detect missing value markers (empty strings and the value of na_values). In
data without any NAs, passing na_filter=False can improve the performance
of reading a large file.</dd>
<dt>verbose <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>Indicate number of NA values placed in non-numeric columns.</dd>
<dt>skip_blank_lines <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd>If True, skip over blank lines rather than interpreting as NaN values.</dd>
<dt>parse_dates <span class="classifier-delimiter">:</span> <span class="classifier">bool or list of int or names or list of lists or dict, default False</span></dt>
<dd><p class="first">The behavior is as follows:</p>
<ul class="simple">
<li>boolean. If True -&gt; try parsing the index.</li>
<li>list of int or names. e.g. If [1, 2, 3] -&gt; try parsing columns 1, 2, 3
each as a separate date column.</li>
<li>list of lists. e.g. If [[1, 3]] -&gt; combine columns 1 and 3 and parse as
a single date column.</li>
<li>dict, e.g. {‘foo’ : [1, 3]} -&gt; parse columns 1, 3 as date and call
result ‘foo’</li>
</ul>
<p>If a column or index cannot be represented as an array of datetimes,
say because of an unparsable value or a mixture of timezones, the column
or index will be returned unaltered as an object data type. For
non-standard datetime parsing, use <code class="docutils literal notranslate"><span class="pre">pd.to_datetime</span></code> after
<code class="docutils literal notranslate"><span class="pre">pd.read_csv</span></code>. To parse an index or column with a mixture of timezones,
specify <code class="docutils literal notranslate"><span class="pre">date_parser</span></code> to be a partially-applied
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.to_datetime.html#pandas.to_datetime" title="(in pandas v2.1.0.dev0+589.g961f9c4d78)"><code class="xref py py-func docutils literal notranslate"><span class="pre">pandas.to_datetime()</span></code></a> with <code class="docutils literal notranslate"><span class="pre">utc=True</span></code>. See
<a class="reference external" href="https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#io-csv-mixed-timezones" title="(in pandas v2.1.0.dev0+589.g961f9c4d78)"><span>Parsing a CSV with mixed timezones</span></a> for more.</p>
<p class="last">Note: A fast-path exists for iso8601-formatted dates.</p>
</dd>
<dt>infer_datetime_format <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>If True and <cite>parse_dates</cite> is enabled, pandas will attempt to infer the
format of the datetime strings in the columns, and if it can be inferred,
switch to a faster method of parsing them. In some cases this can increase
the parsing speed by 5-10x.</dd>
<dt>keep_date_col <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>If True and <cite>parse_dates</cite> specifies combining multiple columns then
keep the original columns.</dd>
<dt>date_parser <span class="classifier-delimiter">:</span> <span class="classifier">function, optional</span></dt>
<dd>Function to use for converting a sequence of string columns to an array of
datetime instances. The default uses <code class="docutils literal notranslate"><span class="pre">dateutil.parser.parser</span></code> to do the
conversion. Pandas will try to call <cite>date_parser</cite> in three different ways,
advancing to the next if an exception occurs: 1) Pass one or more arrays
(as defined by <cite>parse_dates</cite>) as arguments; 2) concatenate (row-wise) the
string values from the columns defined by <cite>parse_dates</cite> into a single array
and pass that; and 3) call <cite>date_parser</cite> once for each row using one or
more strings (corresponding to the columns defined by <cite>parse_dates</cite>) as
arguments.</dd>
<dt>dayfirst <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>DD/MM format dates, international and European format.</dd>
<dt>cache_dates <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd><p class="first">If True, use a cache of unique, converted dates to apply the datetime
conversion. May produce significant speed-up when parsing duplicate
date strings, especially ones with timezone offsets.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 0.25.0.</span></p>
</div>
</dd>
<dt>chunksize <span class="classifier-delimiter">:</span> <span class="classifier">int, optional</span></dt>
<dd><p class="first">Return TextFileReader object for iteration.
See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking">IO Tools docs</a>
for more information on <code class="docutils literal notranslate"><span class="pre">iterator</span></code> and <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">TextFileReader</span></code> is a context manager.</p>
</div>
</dd>
<dt>compression <span class="classifier-delimiter">:</span> <span class="classifier">str or dict, default ‘infer’</span></dt>
<dd><p class="first">For on-the-fly decompression of on-disk data. If ‘infer’ and ‘filepath_or_buffer’ is
path-like, then detect compression from the following extensions: ‘.gz’,
‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’
(otherwise no compression).
If using ‘zip’ or ‘tar’, the archive must contain only one data file to be read in.
Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression.
Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set
to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other
key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>,
<code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or
<code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively.
As an example, the following could be passed for Zstandard decompression using a
custom compression dictionary:
<code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p>
</div>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p>
</div>
</dd>
<dt>thousands <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd>Thousands separator.</dd>
<dt>decimal <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘.’</span></dt>
<dd>Character to recognize as decimal point (e.g. use ‘,’ for European data).</dd>
<dt>lineterminator <span class="classifier-delimiter">:</span> <span class="classifier">str (length 1), optional</span></dt>
<dd>Character to break file into lines. Only valid with C parser.</dd>
<dt>quotechar <span class="classifier-delimiter">:</span> <span class="classifier">str (length 1), optional</span></dt>
<dd>The character used to denote the start and end of a quoted item. Quoted
items can include the delimiter and it will be ignored.</dd>
<dt>quoting <span class="classifier-delimiter">:</span> <span class="classifier">int or csv.QUOTE_* instance, default 0</span></dt>
<dd>Control field quoting behavior per <code class="docutils literal notranslate"><span class="pre">csv.QUOTE_*</span></code> constants. Use one of
QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).</dd>
<dt>doublequote <span class="classifier-delimiter">:</span> <span class="classifier">bool, default <code class="docutils literal notranslate"><span class="pre">True</span></code></span></dt>
<dd>When quotechar is specified and quoting is not <code class="docutils literal notranslate"><span class="pre">QUOTE_NONE</span></code>, indicate
whether or not to interpret two consecutive quotechar elements INSIDE a
field as a single <code class="docutils literal notranslate"><span class="pre">quotechar</span></code> element.</dd>
<dt>escapechar <span class="classifier-delimiter">:</span> <span class="classifier">str (length 1), optional</span></dt>
<dd>One-character string used to escape other characters.</dd>
<dt>comment <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd>Indicates remainder of line should not be parsed. If found at the beginning
of a line, the line will be ignored altogether. This parameter must be a
single character. Like empty lines (as long as <code class="docutils literal notranslate"><span class="pre">skip_blank_lines=True</span></code>),
fully commented lines are ignored by the parameter <cite>header</cite> but not by
<cite>skiprows</cite>. For example, if <code class="docutils literal notranslate"><span class="pre">comment='#'</span></code>, parsing
<code class="docutils literal notranslate"><span class="pre">#empty\na,b,c\n1,2,3</span></code> with <code class="docutils literal notranslate"><span class="pre">header=0</span></code> will result in ‘a,b,c’ being
treated as the header.</dd>
<dt>encoding <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd><p class="first">Encoding to use for UTF when reading/writing (e.g. ‘utf-8’). <a class="reference external" href="https://docs.python.org/3/library/codecs.html#standard-encodings">List of Python
standard encodings</a>.</p>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span>When <code class="docutils literal notranslate"><span class="pre">encoding</span></code> is <code class="docutils literal notranslate"><span class="pre">None</span></code>, <code class="docutils literal notranslate"><span class="pre">errors=&quot;replace&quot;</span></code> is passed to
<code class="docutils literal notranslate"><span class="pre">open()</span></code>. Otherwise, <code class="docutils literal notranslate"><span class="pre">errors=&quot;strict&quot;</span></code> is passed to <code class="docutils literal notranslate"><span class="pre">open()</span></code>.
This behavior was previously only the case for <code class="docutils literal notranslate"><span class="pre">engine=&quot;python&quot;</span></code>.</p>
</div>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.3.0: </span><code class="docutils literal notranslate"><span class="pre">encoding_errors</span></code> is a new argument. <code class="docutils literal notranslate"><span class="pre">encoding</span></code> no longer has an
influence on how encoding errors are handled.</p>
</div>
</dd>
<dt>encoding_errors <span class="classifier-delimiter">:</span> <span class="classifier">str, optional, default “strict”</span></dt>
<dd><p class="first">How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a>.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd>
<dt>dialect <span class="classifier-delimiter">:</span> <span class="classifier">str or csv.Dialect, optional</span></dt>
<dd>If provided, this parameter will override values (default or not) for the
following parameters: <cite>delimiter</cite>, <cite>doublequote</cite>, <cite>escapechar</cite>,
<cite>skipinitialspace</cite>, <cite>quotechar</cite>, and <cite>quoting</cite>. If it is necessary to
override values, a ParserWarning will be issued. See csv.Dialect
documentation for more details.</dd>
<dt>error_bad_lines <span class="classifier-delimiter">:</span> <span class="classifier">bool, optional, default <code class="docutils literal notranslate"><span class="pre">None</span></code></span></dt>
<dd><p class="first">Lines with too many fields (e.g. a csv line with too many commas) will by
default cause an exception to be raised, and no DataFrame will be returned.
If False, then these “bad lines” will be dropped from the DataFrame that is
returned.</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.3.0: </span>The <code class="docutils literal notranslate"><span class="pre">on_bad_lines</span></code> parameter should be used instead to specify behavior upon
encountering a bad line.</p>
</div>
</dd>
<dt>warn_bad_lines <span class="classifier-delimiter">:</span> <span class="classifier">bool, optional, default <code class="docutils literal notranslate"><span class="pre">None</span></code></span></dt>
<dd><p class="first">If error_bad_lines is False, and warn_bad_lines is True, a warning for each
“bad line” will be output.</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.3.0: </span>The <code class="docutils literal notranslate"><span class="pre">on_bad_lines</span></code> parameter should be used instead to specify behavior upon
encountering a bad line.</p>
</div>
</dd>
<dt>on_bad_lines <span class="classifier-delimiter">:</span> <span class="classifier">{‘error’, ‘warn’, ‘skip’} or callable, default ‘error’</span></dt>
<dd><p class="first">Specifies what to do upon encountering a bad line (a line with too many fields).
Allowed values are:</p>
<blockquote>
<div><ul class="simple">
<li>‘error’, raise an Exception when a bad line is encountered.</li>
<li>‘warn’, raise a warning when a bad line is encountered and skip that line.</li>
<li>‘skip’, skip bad lines without raising or warning when they are encountered.</li>
</ul>
</div></blockquote>
<div class="versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
<div class="last versionadded">
<ul>
<li><p class="first"><span class="versionmodified">New in version 1.4.0: </span>callable, function with signature
<code class="docutils literal notranslate"><span class="pre">(bad_line:</span> <span class="pre">list[str])</span> <span class="pre">-&gt;</span> <span class="pre">list[str]</span> <span class="pre">|</span> <span class="pre">None</span></code> that will process a single
bad line. <code class="docutils literal notranslate"><span class="pre">bad_line</span></code> is a list of strings split by the <code class="docutils literal notranslate"><span class="pre">sep</span></code>.
If the function returns <code class="docutils literal notranslate"><span class="pre">None</span></code>, the bad line will be ignored.
If the function returns a new list of strings with more elements than
expected, a <code class="docutils literal notranslate"><span class="pre">ParserWarning</span></code> will be emitted while dropping extra elements.
Only supported when <code class="docutils literal notranslate"><span class="pre">engine=&quot;python&quot;</span></code></p>
</li>
</ul>
</div>
</dd>
<dt>delim_whitespace <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>Specifies whether or not whitespace (e.g. <code class="docutils literal notranslate"><span class="pre">'</span> <span class="pre">'</span></code> or <code class="docutils literal notranslate"><span class="pre">'</span>&#160;&#160;&#160; <span class="pre">'</span></code>) will be
used as the sep. Equivalent to setting <code class="docutils literal notranslate"><span class="pre">sep='\s+'</span></code>. If this option
is set to True, nothing should be passed in for the <code class="docutils literal notranslate"><span class="pre">delimiter</span></code>
parameter.</dd>
<dt>low_memory <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd>Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
types either set False, or specify the type with the <cite>dtype</cite> parameter.
Note that the entire file is read into a single DataFrame regardless,
use the <cite>chunksize</cite> or <cite>iterator</cite> parameter to return the data in chunks.
(Only valid with C parser).</dd>
<dt>memory_map <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>If a filepath is provided for <cite>filepath_or_buffer</cite>, map the file object
directly onto memory and access the data directly from there. Using this
option can improve performance because there is no longer any I/O overhead.</dd>
<dt>float_precision <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd><p class="first">Specifies which converter the C engine should use for floating-point
values. The options are <code class="docutils literal notranslate"><span class="pre">None</span></code> or ‘high’ for the ordinary converter,
‘legacy’ for the original lower precision pandas converter, and
‘round_trip’ for the round-trip converter.</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.2.</span></p>
</div>
</dd>
<dt>storage_options <span class="classifier-delimiter">:</span> <span class="classifier">dict, optional</span></dt>
<dd><p class="first">Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other
URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are
forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more
details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.2.</span></p>
</div>
</dd>
</dl>
</dd></dl>
<dl class="function">
<dt id="apache_beam.io.textio.WriteToCsv">
<code class="descclassname">apache_beam.io.textio.</code><code class="descname">WriteToCsv</code><span class="sig-paren">(</span><em>path: str</em>, <em>num_shards: Optional[int] = None</em>, <em>file_naming: Optional[fileio.FileNaming] = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#WriteToCsv"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.WriteToCsv" title="Permalink to this definition"></a></dt>
<dd><dl class="docutils">
<dt>A PTransform for writing a schema’d PCollection as a (set of)</dt>
<dd><p class="first">comma-separated values (csv) files.</p>
<dl class="last docutils">
<dt>Args:</dt>
<dd><dl class="first docutils">
<dt>path (str): The file path to write to. The files written will</dt>
<dd>begin with this prefix, followed by a shard identifier (see
<cite>num_shards</cite>) according to the <cite>file_naming</cite> parameter.</dd>
<dt>num_shards (optional int): The number of shards to use in the distributed</dt>
<dd>write. Defaults to None, letting the system choose an optimal value.</dd>
<dt>file_naming (optional callable): A file-naming strategy, determining the</dt>
<dd>actual shard names given their shard number, etc.
See the section on <a class="reference external" href="https://beam.apache.org/releases/pydoc/current/apache_beam.io.fileio.html#file-naming">file naming</a>
Defaults to <cite>fileio.default_file_naming</cite>, which names files as
<cite>path-XXXXX-of-NNNNN</cite>.</dd>
</dl>
<p class="last"><code class="docutils literal notranslate"><span class="pre">**kwargs</span></code>: Extra arguments passed to <cite>pandas.DataFrame.to_csv</cite> (see below).</p>
</dd>
</dl>
</dd>
</dl>
<dl class="docutils">
<dt>sep <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘,’</span></dt>
<dd>String of length 1. Field delimiter for the output file.</dd>
<dt>na_rep <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘’</span></dt>
<dd>Missing data representation.</dd>
<dt>float_format <span class="classifier-delimiter">:</span> <span class="classifier">str, Callable, default None</span></dt>
<dd>Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.</dd>
<dt>columns <span class="classifier-delimiter">:</span> <span class="classifier">sequence, optional</span></dt>
<dd>Columns to write.</dd>
<dt>header <span class="classifier-delimiter">:</span> <span class="classifier">bool or list of str, default True</span></dt>
<dd>Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.</dd>
<dt>mode <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘w’</span></dt>
<dd>Python write mode. The available write modes are the same as
<a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.11)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a>.</dd>
<dt>encoding <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd>A string representing the encoding to use in the output file,
defaults to ‘utf-8’. <cite>encoding</cite> is not supported if <cite>path_or_buf</cite>
is a non-binary file object.</dd>
<dt>compression <span class="classifier-delimiter">:</span> <span class="classifier">str or dict, default ‘infer’</span></dt>
<dd><p class="first">For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is
path-like, then detect compression from the following extensions: ‘.gz’,
‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’
(otherwise no compression).
Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression.
Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set
to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other
key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>,
<code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code> or
<code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively.
As an example, the following could be passed for faster compression and to create
a reproducible gzip archive:
<code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p>
<blockquote>
<div><div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p>
</div>
</div></blockquote>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.0.0: </span>May now be a dict with key ‘method’ as compression mode
and other entries as additional compression options if
compression mode is ‘zip’.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.1.0: </span>Passing compression options as keys in dict is
supported for compression modes ‘gzip’, ‘bz2’, ‘zstd’, and ‘zip’.</p>
</div>
<div class="versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>Compression is supported for binary file objects.</p>
</div>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.2.0: </span>Previous versions forwarded dict entries for ‘gzip’ to
<cite>gzip.open</cite> instead of <cite>gzip.GzipFile</cite> which prevented
setting <cite>mtime</cite>.</p>
</div>
</dd>
<dt>quoting <span class="classifier-delimiter">:</span> <span class="classifier">optional constant from csv module</span></dt>
<dd>Defaults to csv.QUOTE_MINIMAL. If you have set a <cite>float_format</cite>
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.</dd>
<dt>quotechar <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘&quot;’</span></dt>
<dd>String of length 1. Character used to quote fields.</dd>
<dt>lineterminator <span class="classifier-delimiter">:</span> <span class="classifier">str, optional</span></dt>
<dd><p class="first">The newline character or character sequence to use in the output
file. Defaults to <cite>os.linesep</cite>, which depends on the OS in which
this method is called (e.g. ‘\n’ for Linux, ‘\r\n’ for Windows).</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.5.0: </span>Previously was line_terminator, changed for consistency with
read_csv and the standard library ‘csv’ module.</p>
</div>
</dd>
<dt>chunksize <span class="classifier-delimiter">:</span> <span class="classifier">int or None</span></dt>
<dd>Rows to write at a time.</dd>
<dt>date_format <span class="classifier-delimiter">:</span> <span class="classifier">str, default None</span></dt>
<dd>Format string for datetime objects.</dd>
<dt>doublequote <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd>Control quoting of <cite>quotechar</cite> inside a field.</dd>
<dt>escapechar <span class="classifier-delimiter">:</span> <span class="classifier">str, default None</span></dt>
<dd>String of length 1. Character used to escape <cite>sep</cite> and <cite>quotechar</cite>
when appropriate.</dd>
<dt>decimal <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘.’</span></dt>
<dd>Character recognized as decimal separator. E.g. use ‘,’ for
European data.</dd>
<dt>errors <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘strict’</span></dt>
<dd><p class="first">Specifies how encoding and decoding errors are to be handled.
See the errors argument for <a class="reference external" href="https://docs.python.org/3/library/functions.html#open" title="(in Python v3.11)"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> for a full list
of options.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.1.0.</span></p>
</div>
</dd>
<dt>storage_options <span class="classifier-delimiter">:</span> <span class="classifier">dict, optional</span></dt>
<dd><p class="first">Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other
URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are
forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more
details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd>
</dl>
</dd></dl>
<dl class="function">
<dt id="apache_beam.io.textio.ReadFromJson">
<code class="descclassname">apache_beam.io.textio.</code><code class="descname">ReadFromJson</code><span class="sig-paren">(</span><em>path: str</em>, <em>*</em>, <em>orient: str = 'records'</em>, <em>lines: bool = True</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#ReadFromJson"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.ReadFromJson" title="Permalink to this definition"></a></dt>
<dd><p>A PTransform for reading json values from files into a PCollection.</p>
<blockquote>
<div><dl class="docutils">
<dt>Args:</dt>
<dd><dl class="first docutils">
<dt>path (str): The file path to read from. The path can contain glob</dt>
<dd>characters such as <code class="docutils literal notranslate"><span class="pre">*</span></code> and <code class="docutils literal notranslate"><span class="pre">?</span></code>.</dd>
<dt>orient (str): Format of the json elements in the file.</dt>
<dd>Defaults to ‘records’, meaning the file is expected to contain a list
of json objects like <cite>{field1: value1, field2: value2, …}</cite>.</dd>
<dt>lines (bool): Whether each line should be considered a separate record,</dt>
<dd>as opposed to the entire file being a valid JSON object or list.
Defaults to True (unlike Pandas).</dd>
</dl>
<p class="last"><code class="docutils literal notranslate"><span class="pre">**kwargs</span></code>: Extra arguments passed to <cite>pandas.read_json</cite> (see below).</p>
</dd>
</dl>
</div></blockquote>
<dl class="docutils">
<dt>orient <span class="classifier-delimiter">:</span> <span class="classifier">str</span></dt>
<dd><p class="first">Indication of expected JSON string format.
Compatible JSON strings can be produced by <code class="docutils literal notranslate"><span class="pre">to_json()</span></code> with a
corresponding orient value.
The set of possible orients is:</p>
<ul class="simple">
<li><code class="docutils literal notranslate"><span class="pre">'split'</span></code> : dict like
<code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">[index],</span> <span class="pre">columns</span> <span class="pre">-&gt;</span> <span class="pre">[columns],</span> <span class="pre">data</span> <span class="pre">-&gt;</span> <span class="pre">[values]}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'records'</span></code> : list like
<code class="docutils literal notranslate"><span class="pre">[{column</span> <span class="pre">-&gt;</span> <span class="pre">value},</span> <span class="pre">...</span> <span class="pre">,</span> <span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">value}]</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'index'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">value}}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'columns'</span></code> : dict like <code class="docutils literal notranslate"><span class="pre">{column</span> <span class="pre">-&gt;</span> <span class="pre">{index</span> <span class="pre">-&gt;</span> <span class="pre">value}}</span></code></li>
<li><code class="docutils literal notranslate"><span class="pre">'values'</span></code> : just the values array</li>
</ul>
<p>The allowed and default values depend on the value
of the <cite>typ</cite> parameter.</p>
<ul class="last simple">
<li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'series'</span></code>,<ul>
<li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index'}</span></code></li>
<li>default is <code class="docutils literal notranslate"><span class="pre">'index'</span></code></li>
<li>The Series index must be unique for orient <code class="docutils literal notranslate"><span class="pre">'index'</span></code>.</li>
</ul>
</li>
<li>when <code class="docutils literal notranslate"><span class="pre">typ</span> <span class="pre">==</span> <span class="pre">'frame'</span></code>,<ul>
<li>allowed orients are <code class="docutils literal notranslate"><span class="pre">{'split','records','index',</span>
<span class="pre">'columns','values',</span> <span class="pre">'table'}</span></code></li>
<li>default is <code class="docutils literal notranslate"><span class="pre">'columns'</span></code></li>
<li>The DataFrame index must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code> and
<code class="docutils literal notranslate"><span class="pre">'columns'</span></code>.</li>
<li>The DataFrame columns must be unique for orients <code class="docutils literal notranslate"><span class="pre">'index'</span></code>,
<code class="docutils literal notranslate"><span class="pre">'columns'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'records'</span></code>.</li>
</ul>
</li>
</ul>
</dd>
<dt>typ <span class="classifier-delimiter">:</span> <span class="classifier">{‘frame’, ‘series’}, default ‘frame’</span></dt>
<dd>The type of object to recover.</dd>
<dt>dtype <span class="classifier-delimiter">:</span> <span class="classifier">bool or dict, default None</span></dt>
<dd><p class="first">If True, infer dtypes; if a dict of column to dtype, then use those;
if False, then don’t infer dtypes at all, applies only to the data.</p>
<p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 0.25.0: </span>Not applicable for <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>.</p>
</div>
</dd>
<dt>convert_axes <span class="classifier-delimiter">:</span> <span class="classifier">bool, default None</span></dt>
<dd><p class="first">Try to convert the axes to the proper dtypes.</p>
<p>For all <code class="docutils literal notranslate"><span class="pre">orient</span></code> values except <code class="docutils literal notranslate"><span class="pre">'table'</span></code>, default is True.</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 0.25.0: </span>Not applicable for <code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>.</p>
</div>
</dd>
<dt>convert_dates <span class="classifier-delimiter">:</span> <span class="classifier">bool or list of str, default True</span></dt>
<dd>If True then default datelike columns may be converted (depending on
keep_default_dates).
If False, no dates will be converted.
If a list of column names, then those columns will be converted and
default datelike columns may also be converted (depending on
keep_default_dates).</dd>
<dt>keep_default_dates <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd><p class="first">If parsing dates (convert_dates is not False), then try to parse the
default datelike columns.
A column label is datelike if</p>
<ul class="last simple">
<li>it ends with <code class="docutils literal notranslate"><span class="pre">'_at'</span></code>,</li>
<li>it ends with <code class="docutils literal notranslate"><span class="pre">'_time'</span></code>,</li>
<li>it begins with <code class="docutils literal notranslate"><span class="pre">'timestamp'</span></code>,</li>
<li>it is <code class="docutils literal notranslate"><span class="pre">'modified'</span></code>, or</li>
<li>it is <code class="docutils literal notranslate"><span class="pre">'date'</span></code>.</li>
</ul>
</dd>
<dt>numpy <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd><p class="first">Direct decoding to numpy arrays. Supports numeric data only, but
non-numeric column and index labels are supported. Note also that the
JSON ordering MUST be the same for each term if numpy=True.</p>
<div class="last deprecated">
<p><span class="versionmodified">Deprecated since version 1.0.0.</span></p>
</div>
</dd>
<dt>precise_float <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>Set to enable usage of higher precision (strtod) function when
decoding string to double values. Default (False) is to use fast but
less precise builtin functionality.</dd>
<dt>date_unit <span class="classifier-delimiter">:</span> <span class="classifier">str, default None</span></dt>
<dd>The timestamp unit to detect if converting dates. The default behaviour
is to try and detect the correct precision, but if this is not desired
then pass one of ‘s’, ‘ms’, ‘us’ or ‘ns’ to force parsing only seconds,
milliseconds, microseconds or nanoseconds respectively.</dd>
<dt>encoding <span class="classifier-delimiter">:</span> <span class="classifier">str, default is ‘utf-8’</span></dt>
<dd>The encoding to use to decode py3 bytes.</dd>
<dt>encoding_errors <span class="classifier-delimiter">:</span> <span class="classifier">str, optional, default “strict”</span></dt>
<dd><p class="first">How encoding errors are treated. <a class="reference external" href="https://docs.python.org/3/library/codecs.html#error-handlers">List of possible values</a> .</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.3.0.</span></p>
</div>
</dd>
<dt>lines <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>Read the file as a json object per line.</dd>
<dt>chunksize <span class="classifier-delimiter">:</span> <span class="classifier">int, optional</span></dt>
<dd><p class="first">Return JsonReader object for iteration.
See the <a class="reference external" href="https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#line-delimited-json">line-delimited json docs</a>
for more information on <code class="docutils literal notranslate"><span class="pre">chunksize</span></code>.
This can only be passed if <cite>lines=True</cite>.
If this is None, the file will be read into memory all at once.</p>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.2: </span><code class="docutils literal notranslate"><span class="pre">JsonReader</span></code> is a context manager.</p>
</div>
</dd>
<dt>compression <span class="classifier-delimiter">:</span> <span class="classifier">str or dict, default ‘infer’</span></dt>
<dd><p class="first">For on-the-fly decompression of on-disk data. If ‘infer’ and ‘path_or_buf’ is
path-like, then detect compression from the following extensions: ‘.gz’,
‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’
(otherwise no compression).
If using ‘zip’ or ‘tar’, the ZIP file must contain only one data file to be read in.
Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no decompression.
Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set
to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other
key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>,
<code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdDecompressor</span></code> or
<code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively.
As an example, the following could be passed for Zstandard decompression using a
custom compression dictionary:
<code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'zstd',</span> <span class="pre">'dict_data':</span> <span class="pre">my_compression_dict}</span></code>.</p>
<blockquote>
<div><div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p>
</div>
</div></blockquote>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p>
</div>
</dd>
<dt>nrows <span class="classifier-delimiter">:</span> <span class="classifier">int, optional</span></dt>
<dd><p class="first">The number of lines to read from the line-delimited JSON file.
This can only be passed if <cite>lines=True</cite>.
If this is None, all the rows will be returned.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.1.</span></p>
</div>
</dd>
<dt>storage_options <span class="classifier-delimiter">:</span> <span class="classifier">dict, optional</span></dt>
<dd><p class="first">Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other
URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are
forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more
details, and for more examples on storage options refer <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">here</a>.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd>
</dl>
</dd></dl>
<dl class="function">
<dt id="apache_beam.io.textio.WriteToJson">
<code class="descclassname">apache_beam.io.textio.</code><code class="descname">WriteToJson</code><span class="sig-paren">(</span><em>path: str</em>, <em>*</em>, <em>num_shards: Optional[int] = None</em>, <em>file_naming: Optional[fileio.FileNaming] = None</em>, <em>orient: str = 'records'</em>, <em>lines: Optional[bool] = None</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/textio.html#WriteToJson"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.textio.WriteToJson" title="Permalink to this definition"></a></dt>
<dd><p>A PTransform for writing a PCollection as json values to files.</p>
<blockquote>
<div><dl class="docutils">
<dt>Args:</dt>
<dd><dl class="first last docutils">
<dt>path (str): The file path to write to. The files written will</dt>
<dd>begin with this prefix, followed by a shard identifier (see
<cite>num_shards</cite>) according to the <cite>file_naming</cite> parameter.</dd>
<dt>num_shards (optional int): The number of shards to use in the distributed</dt>
<dd>write. Defaults to None, letting the system choose an optimal value.</dd>
<dt>file_naming (optional callable): A file-naming strategy, determining the</dt>
<dd>actual shard names given their shard number, etc.
See the section on <a class="reference external" href="https://beam.apache.org/releases/pydoc/current/apache_beam.io.fileio.html#file-naming">file naming</a>.
Defaults to <cite>fileio.default_file_naming</cite>, which names files as
<cite>path-XXXXX-of-NNNNN</cite>.</dd>
<dt>orient (str): Format of the json elements in the file.</dt>
<dd>Defaults to ‘records’, meaning the file will contain a list
of json objects like <cite>{field1: value1, field2: value2, …}</cite>.</dd>
<dt>lines (bool): Whether each line should be considered a separate record,</dt>
<dd>as opposed to the entire file being a valid JSON object or list.
Defaults to True if orient is ‘records’ (unlike Pandas).</dd>
<dt>**kwargs: Extra arguments passed to <cite>pandas.DataFrame.to_json</cite></dt>
<dd>(see below).</dd>
</dl>
</dd>
</dl>
</div></blockquote>
<dl class="docutils">
<dt>orient <span class="classifier-delimiter">:</span> <span class="classifier">str</span></dt>
<dd><p class="first">Indication of expected JSON string format.</p>
<ul class="last">
<li><p class="first">Series:</p>
<blockquote>
<div><ul class="simple">
<li>default is ‘index’</li>
<li>allowed values are: {‘split’, ‘records’, ‘index’, ‘table’}.</li>
</ul>
</div></blockquote>
</li>
<li><p class="first">DataFrame:</p>
<blockquote>
<div><ul class="simple">
<li>default is ‘columns’</li>
<li>allowed values are: {‘split’, ‘records’, ‘index’, ‘columns’,
‘values’, ‘table’}.</li>
</ul>
</div></blockquote>
</li>
<li><p class="first">The format of the JSON string:</p>
<blockquote>
<div><ul class="simple">
<li>‘split’ : dict like {‘index’ -&gt; [index], ‘columns’ -&gt; [columns],
‘data’ -&gt; [values]}</li>
<li>‘records’ : list like [{column -&gt; value}, … , {column -&gt; value}]</li>
<li>‘index’ : dict like {index -&gt; {column -&gt; value}}</li>
<li>‘columns’ : dict like {column -&gt; {index -&gt; value}}</li>
<li>‘values’ : just the values array</li>
<li>‘table’ : dict like {‘schema’: {schema}, ‘data’: {data}}</li>
</ul>
<p>Describing the data, where the data component is like <code class="docutils literal notranslate"><span class="pre">orient='records'</span></code>.</p>
</div></blockquote>
</li>
</ul>
</dd>
<dt>date_format <span class="classifier-delimiter">:</span> <span class="classifier">{None, ‘epoch’, ‘iso’}</span></dt>
<dd>Type of date conversion. ‘epoch’ = epoch milliseconds,
‘iso’ = ISO8601. The default depends on the <cite>orient</cite>. For
<code class="docutils literal notranslate"><span class="pre">orient='table'</span></code>, the default is ‘iso’. For all other orients,
the default is ‘epoch’.</dd>
<dt>double_precision <span class="classifier-delimiter">:</span> <span class="classifier">int, default 10</span></dt>
<dd>The number of decimal places to use when encoding
floating point values.</dd>
<dt>force_ascii <span class="classifier-delimiter">:</span> <span class="classifier">bool, default True</span></dt>
<dd>Force encoded string to be ASCII.</dd>
<dt>date_unit <span class="classifier-delimiter">:</span> <span class="classifier">str, default ‘ms’ (milliseconds)</span></dt>
<dd>The time unit to encode to, governs timestamp and ISO8601
precision. One of ‘s’, ‘ms’, ‘us’, ‘ns’ for second, millisecond,
microsecond, and nanosecond respectively.</dd>
<dt>default_handler <span class="classifier-delimiter">:</span> <span class="classifier">callable, default None</span></dt>
<dd>Handler to call if object cannot otherwise be converted to a
suitable format for JSON. Should receive a single argument which is
the object to convert and return a serialisable object.</dd>
<dt>lines <span class="classifier-delimiter">:</span> <span class="classifier">bool, default False</span></dt>
<dd>If ‘orient’ is ‘records’, write out line-delimited JSON. Raises
ValueError for any other ‘orient’, since the other formats are not
list-like.</dd>
<dt>compression <span class="classifier-delimiter">:</span> <span class="classifier">str or dict, default ‘infer’</span></dt>
<dd><p class="first">For on-the-fly compression of the output data. If ‘infer’ and ‘path_or_buf’ is
path-like, then detect compression from the following extensions: ‘.gz’,
‘.bz2’, ‘.zip’, ‘.xz’, ‘.zst’, ‘.tar’, ‘.tar.gz’, ‘.tar.xz’ or ‘.tar.bz2’
(otherwise no compression).
Set to <code class="docutils literal notranslate"><span class="pre">None</span></code> for no compression.
Can also be a dict with key <code class="docutils literal notranslate"><span class="pre">'method'</span></code> set
to one of {<code class="docutils literal notranslate"><span class="pre">'zip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'gzip'</span></code>, <code class="docutils literal notranslate"><span class="pre">'bz2'</span></code>, <code class="docutils literal notranslate"><span class="pre">'zstd'</span></code>, <code class="docutils literal notranslate"><span class="pre">'tar'</span></code>} and other
key-value pairs are forwarded to
<code class="docutils literal notranslate"><span class="pre">zipfile.ZipFile</span></code>, <code class="docutils literal notranslate"><span class="pre">gzip.GzipFile</span></code>,
<code class="docutils literal notranslate"><span class="pre">bz2.BZ2File</span></code>, <code class="docutils literal notranslate"><span class="pre">zstandard.ZstdCompressor</span></code> or
<code class="docutils literal notranslate"><span class="pre">tarfile.TarFile</span></code>, respectively.
As an example, the following could be passed for faster compression and to create
a reproducible gzip archive:
<code class="docutils literal notranslate"><span class="pre">compression={'method':</span> <span class="pre">'gzip',</span> <span class="pre">'compresslevel':</span> <span class="pre">1,</span> <span class="pre">'mtime':</span> <span class="pre">1}</span></code>.</p>
<blockquote>
<div><div class="versionadded">
<p><span class="versionmodified">New in version 1.5.0: </span>Added support for <cite>.tar</cite> files.</p>
</div>
</div></blockquote>
<div class="last versionchanged">
<p><span class="versionmodified">Changed in version 1.4.0: </span>Zstandard support.</p>
</div>
</dd>
<dt>indent <span class="classifier-delimiter">:</span> <span class="classifier">int, optional</span></dt>
<dd><p class="first">Length of whitespace used to indent each record.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.0.0.</span></p>
</div>
</dd>
<dt>storage_options <span class="classifier-delimiter">:</span> <span class="classifier">dict, optional</span></dt>
<dd><p class="first">Extra options that make sense for a particular storage connection, e.g.
host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
are forwarded to <code class="docutils literal notranslate"><span class="pre">urllib.request.Request</span></code> as header options. For other
URLs (e.g. starting with “s3://”, and “gcs://”) the key-value pairs are
forwarded to <code class="docutils literal notranslate"><span class="pre">fsspec.open</span></code>. Please see <code class="docutils literal notranslate"><span class="pre">fsspec</span></code> and <code class="docutils literal notranslate"><span class="pre">urllib</span></code> for more
details. For more examples of storage options, see the <a class="reference external" href="https://pandas.pydata.org/docs/user_guide/io.html?highlight=storage_options#reading-writing-remote-files">pandas user guide section on reading and writing remote files</a>.</p>
<div class="last versionadded">
<p><span class="versionmodified">New in version 1.2.0.</span></p>
</div>
</dd>
</dl>
</dd></dl>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="apache_beam.io.tfrecordio.html" class="btn btn-neutral float-right" title="apache_beam.io.tfrecordio module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="apache_beam.io.source_test_utils.html" class="btn btn-neutral float-left" title="apache_beam.io.source_test_utils module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Passing a function to jQuery() registers it as a DOM-ready callback; once
// the document is ready, switch on the Read the Docs theme navigation.
// NOTE(review): the `true` flag presumably turns on sticky navigation — confirm in theme.js.
jQuery(function enableThemeNavigation() {
  var navigation = SphinxRtdTheme.Navigation;
  navigation.enable(true);
});
</script>
</body>
</html>