

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>apache_beam.io.parquetio module &mdash; Apache Beam 2.36.0 documentation</title>
  

  
  
  
  

  
  <script type="text/javascript" src="_static/js/modernizr.min.js"></script>
  
    
      <script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
        <script type="text/javascript" src="_static/jquery.js"></script>
        <script type="text/javascript" src="_static/underscore.js"></script>
        <script type="text/javascript" src="_static/doctools.js"></script>
        <script type="text/javascript" src="_static/language_data.js"></script>
        <script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
    
    <script type="text/javascript" src="_static/js/theme.js"></script>

    

  
  <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="apache_beam.io.range_trackers module" href="apache_beam.io.range_trackers.html" />
    <link rel="prev" title="apache_beam.io.mongodbio module" href="apache_beam.io.mongodbio.html" /> 
</head>

<body class="wy-body-for-nav">

   
  <div class="wy-grid-for-nav">
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search" >
          

          
            <a href="index.html" class="icon icon-home"> Apache Beam
          

          
          </a>

          
            
            
              <div class="version">
                2.36.0
              </div>
            
          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
              
            
            
              <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="apache_beam.io.html#subpackages">Subpackages</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="apache_beam.io.html#submodules">Submodules</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.avroio.html">apache_beam.io.avroio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.concat_source.html">apache_beam.io.concat_source module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.debezium.html">apache_beam.io.debezium module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filebasedsink.html">apache_beam.io.filebasedsink module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filebasedsource.html">apache_beam.io.filebasedsource module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.fileio.html">apache_beam.io.fileio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystem.html">apache_beam.io.filesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystemio.html">apache_beam.io.filesystemio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.filesystems.html">apache_beam.io.filesystems module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.hadoopfilesystem.html">apache_beam.io.hadoopfilesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.iobase.html">apache_beam.io.iobase module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.jdbc.html">apache_beam.io.jdbc module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.kafka.html">apache_beam.io.kafka module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.kinesis.html">apache_beam.io.kinesis module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.localfilesystem.html">apache_beam.io.localfilesystem module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.mongodbio.html">apache_beam.io.mongodbio module</a></li>
<li class="toctree-l3 current"><a class="current reference internal" href="#">apache_beam.io.parquetio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.range_trackers.html">apache_beam.io.range_trackers module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.restriction_trackers.html">apache_beam.io.restriction_trackers module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.snowflake.html">apache_beam.io.snowflake module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.source_test_utils.html">apache_beam.io.source_test_utils module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.textio.html">apache_beam.io.textio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.tfrecordio.html">apache_beam.io.tfrecordio module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.utils.html">apache_beam.io.utils module</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.io.watermark_estimators.html">apache_beam.io.watermark_estimators module</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li>
</ul>

            
          
        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" aria-label="top navigation">
        
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">Apache Beam</a>
        
      </nav>


      <div class="wy-nav-content">
        
        <div class="rst-content">
        
          















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="wy-breadcrumbs">
    
      <li><a href="index.html">Docs</a> &raquo;</li>
        
          <li><a href="apache_beam.io.html">apache_beam.io package</a> &raquo;</li>
        
      <li>apache_beam.io.parquetio module</li>
    
    
      <li class="wy-breadcrumbs-aside">
        
            
            <a href="_sources/apache_beam.io.parquetio.rst.txt" rel="nofollow"> View page source</a>
          
        
      </li>
    
  </ul>

  
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="module-apache_beam.io.parquetio">
<span id="apache-beam-io-parquetio-module"></span><h1>apache_beam.io.parquetio module<a class="headerlink" href="#module-apache_beam.io.parquetio" title="Permalink to this headline">¶</a></h1>
<p><code class="docutils literal notranslate"><span class="pre">PTransforms</span></code> for reading from and writing to Parquet files.</p>
<p>Provides two read <code class="docutils literal notranslate"><span class="pre">PTransform</span></code>s, <code class="docutils literal notranslate"><span class="pre">ReadFromParquet</span></code> and
<code class="docutils literal notranslate"><span class="pre">ReadAllFromParquet</span></code>, each of which produces a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of records.
Each element of this <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> will contain a single record read from
a Parquet file. Records that are of simple types will be mapped into
corresponding Python types. The actual parquet file operations are done by
pyarrow. Source splitting is supported at row group granularity.</p>
<p>Additionally, this module provides a write <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> <code class="docutils literal notranslate"><span class="pre">WriteToParquet</span></code>
that can be used to write a given <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of Python objects to a
Parquet file.</p>
<dl class="class">
<dt id="apache_beam.io.parquetio.ReadFromParquetBatched">
<em class="property">class </em><code class="descclassname">apache_beam.io.parquetio.</code><code class="descname">ReadFromParquetBatched</code><span class="sig-paren">(</span><em>file_pattern=None</em>, <em>min_bundle_size=0</em>, <em>validate=True</em>, <em>columns=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquetBatched"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquetBatched" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a> for reading
Parquet files as a <cite>PCollection</cite> of <cite>pyarrow.Table</cite>. This <cite>PTransform</cite> is
currently experimental. No backward-compatibility guarantees.</p>
<p>Initializes <a class="reference internal" href="#apache_beam.io.parquetio.ReadFromParquetBatched" title="apache_beam.io.parquetio.ReadFromParquetBatched"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromParquetBatched</span></code></a>.</p>
<p>An alternative to <a class="reference internal" href="#apache_beam.io.parquetio.ReadFromParquet" title="apache_beam.io.parquetio.ReadFromParquet"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromParquet</span></code></a> that yields each row group from
the Parquet file as a <cite>pyarrow.Table</cite>.  These Table instances can be
processed directly, or converted to a pandas DataFrame for processing.  For
more information on supported types and schema, please see the pyarrow
documentation.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">()</span> <span class="k">as</span> <span class="n">p</span><span class="p">:</span>
  <span class="n">dataframes</span> <span class="o">=</span> <span class="n">p</span> \
      <span class="o">|</span> <span class="s1">&#39;Read&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">ReadFromParquetBatched</span><span class="p">(</span><span class="s1">&#39;/mypath/mypqfiles*&#39;</span><span class="p">)</span> \
      <span class="o">|</span> <span class="s1">&#39;Convert to pandas&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">table</span><span class="p">:</span> <span class="n">table</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">())</span>
</pre></div>
</div>
<p>See also: <a class="reference internal" href="#apache_beam.io.parquetio.ReadFromParquet" title="apache_beam.io.parquetio.ReadFromParquet"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromParquet</span></code></a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>file_pattern</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – the file glob to read</li>
<li><strong>min_bundle_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a>) – the minimum size in bytes, to be considered when
splitting the input into bundles.</li>
<li><strong>validate</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a>) – flag to verify that the files exist at pipeline
creation time.</li>
<li><strong>columns</strong> (<em>List</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>]</em>) – list of columns that will be read from files.
A column name may be a prefix of a nested field, e.g. ‘a’ will select
‘a.b’, ‘a.c’, and ‘a.d.e’</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="apache_beam.io.parquetio.ReadFromParquetBatched.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquetBatched.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquetBatched.expand" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

<dl class="method">
<dt id="apache_beam.io.parquetio.ReadFromParquetBatched.display_data">
<code class="descname">display_data</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquetBatched.display_data"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquetBatched.display_data" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

</dd></dl>

<dl class="class">
<dt id="apache_beam.io.parquetio.ReadFromParquet">
<em class="property">class </em><code class="descclassname">apache_beam.io.parquetio.</code><code class="descname">ReadFromParquet</code><span class="sig-paren">(</span><em>file_pattern=None</em>, <em>min_bundle_size=0</em>, <em>validate=True</em>, <em>columns=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquet"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquet" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">PTransform</span></code></a> for reading
Parquet files as a <cite>PCollection</cite> of dictionaries. This <cite>PTransform</cite> is
currently experimental. No backward-compatibility guarantees.</p>
<p>Initializes <a class="reference internal" href="#apache_beam.io.parquetio.ReadFromParquet" title="apache_beam.io.parquetio.ReadFromParquet"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromParquet</span></code></a>.</p>
<p>Uses source <code class="docutils literal notranslate"><span class="pre">_ParquetSource</span></code> to read a set of Parquet files defined by
a given file pattern.</p>
<p>If <code class="docutils literal notranslate"><span class="pre">/mypath/myparquetfiles*</span></code> is a file-pattern that points to a set of
Parquet files, a <a class="reference internal" href="apache_beam.pvalue.html#apache_beam.pvalue.PCollection" title="apache_beam.pvalue.PCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">PCollection</span></code></a> for the records in
these Parquet files can be created in the following manner.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">()</span> <span class="k">as</span> <span class="n">p</span><span class="p">:</span>
  <span class="n">records</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="s1">&#39;Read&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">ReadFromParquet</span><span class="p">(</span><span class="s1">&#39;/mypath/mypqfiles*&#39;</span><span class="p">)</span>
</pre></div>
</div>
<p>Each element of this <a class="reference internal" href="apache_beam.pvalue.html#apache_beam.pvalue.PCollection" title="apache_beam.pvalue.PCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">PCollection</span></code></a> will contain
a Python dictionary representing a single record. The keys will be of type
<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a> and named after their corresponding column names. The values
will be of the type defined in the corresponding Parquet schema. Records
that are of simple types will be mapped into corresponding Python types.
Records that are of complex types like list and struct will be mapped to
Python list and dictionary respectively. For more information on supported
types and schema, please see the pyarrow documentation.</p>
<p>See also: <a class="reference internal" href="#apache_beam.io.parquetio.ReadFromParquetBatched" title="apache_beam.io.parquetio.ReadFromParquetBatched"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReadFromParquetBatched</span></code></a>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>file_pattern</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a>) – the file glob to read</li>
<li><strong>min_bundle_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.10)"><em>int</em></a>) – the minimum size in bytes, to be considered when
splitting the input into bundles.</li>
<li><strong>validate</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.10)"><em>bool</em></a>) – flag to verify that the files exist at pipeline
creation time.</li>
<li><strong>columns</strong> (<em>List</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.10)"><em>str</em></a><em>]</em>) – list of columns that will be read from files.
A column name may be a prefix of a nested field, e.g. ‘a’ will select
‘a.b’, ‘a.c’, and ‘a.d.e’</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="method">
<dt id="apache_beam.io.parquetio.ReadFromParquet.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquet.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquet.expand" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

<dl class="method">
<dt id="apache_beam.io.parquetio.ReadFromParquet.display_data">
<code class="descname">display_data</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadFromParquet.display_data"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadFromParquet.display_data" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

</dd></dl>

<dl class="class">
<dt id="apache_beam.io.parquetio.ReadAllFromParquetBatched">
<em class="property">class </em><code class="descclassname">apache_beam.io.parquetio.</code><code class="descname">ReadAllFromParquetBatched</code><span class="sig-paren">(</span><em>min_bundle_size=0</em>, <em>desired_bundle_size=67108864</em>, <em>columns=None</em>, <em>with_filename=False</em>, <em>label='ReadAllFiles'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadAllFromParquetBatched"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadAllFromParquetBatched" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> for reading a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of Parquet files.</p>
<p>Uses source <code class="docutils literal notranslate"><span class="pre">_ParquetSource</span></code> to read a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of Parquet files or
file patterns and produce a <code class="docutils literal notranslate"><span class="pre">PCollection</span></code> of <code class="docutils literal notranslate"><span class="pre">pyarrow.Table</span></code>, one for
each Parquet file row group. This <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> is currently experimental.
No backward-compatibility guarantees.</p>
<p>Initializes <code class="docutils literal notranslate"><span class="pre">ReadAllFromParquetBatched</span></code>.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>min_bundle_size</strong> – the minimum size in bytes, to be considered when
splitting the input into bundles.</li>
<li><strong>desired_bundle_size</strong> – the desired size in bytes, to be considered when
splitting the input into bundles.</li>
<li><strong>columns</strong> – list of columns that will be read from files. A column name
may be a prefix of a nested field, e.g. ‘a’ will select
‘a.b’, ‘a.c’, and ‘a.d.e’</li>
<li><strong>with_filename</strong> – If True, returns a key-value pair with the key being the file
name and the value being the actual data. If False, it only returns
the data.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<dl class="attribute">
<dt id="apache_beam.io.parquetio.ReadAllFromParquetBatched.DEFAULT_DESIRED_BUNDLE_SIZE">
<code class="descname">DEFAULT_DESIRED_BUNDLE_SIZE</code><em class="property"> = 67108864</em><a class="headerlink" href="#apache_beam.io.parquetio.ReadAllFromParquetBatched.DEFAULT_DESIRED_BUNDLE_SIZE" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

<dl class="method">
<dt id="apache_beam.io.parquetio.ReadAllFromParquetBatched.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadAllFromParquetBatched.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadAllFromParquetBatched.expand" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

</dd></dl>

<dl class="class">
<dt id="apache_beam.io.parquetio.ReadAllFromParquet">
<em class="property">class </em><code class="descclassname">apache_beam.io.parquetio.</code><code class="descname">ReadAllFromParquet</code><span class="sig-paren">(</span><em>with_filename=False</em>, <em>**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadAllFromParquet"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadAllFromParquet" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<dl class="method">
<dt id="apache_beam.io.parquetio.ReadAllFromParquet.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pvalue</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#ReadAllFromParquet.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.ReadAllFromParquet.expand" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

</dd></dl>

<dl class="class">
<dt id="apache_beam.io.parquetio.WriteToParquet">
<em class="property">class </em><code class="descclassname">apache_beam.io.parquetio.</code><code class="descname">WriteToParquet</code><span class="sig-paren">(</span><em>file_path_prefix</em>, <em>schema</em>, <em>row_group_buffer_size=67108864</em>, <em>record_batch_size=1000</em>, <em>codec='none'</em>, <em>use_deprecated_int96_timestamps=False</em>, <em>file_name_suffix=''</em>, <em>num_shards=0</em>, <em>shard_name_template=None</em>, <em>mime_type='application/x-parquet'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#WriteToParquet"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.WriteToParquet" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform" title="apache_beam.transforms.ptransform.PTransform"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.transforms.ptransform.PTransform</span></code></a></p>
<p>A <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> for writing parquet files.</p>
<p>This <code class="docutils literal notranslate"><span class="pre">PTransform</span></code> is currently experimental. No backward-compatibility
guarantees.</p>
<p>Initialize a WriteToParquet transform.</p>
<p>Writes parquet files from a <a class="reference internal" href="apache_beam.pvalue.html#apache_beam.pvalue.PCollection" title="apache_beam.pvalue.PCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">PCollection</span></code></a> of
records. Each record is a dictionary with keys of a string type that
represent column names. Schema must be specified like the example below.</p>
<!-- Syntax-highlighted Python example (one span per token). It creates two records with 'name' and 'age' keys and writes them through beam.io.WriteToParquet with an explicit pyarrow schema. NOTE(review): `filename` is not defined inside the snippet; presumably the reader supplies their own output path prefix. -->
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">()</span> <span class="k">as</span> <span class="n">p</span><span class="p">:</span>
  <span class="n">records</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="s1">&#39;Read&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Create</span><span class="p">(</span>
      <span class="p">[{</span><span class="s1">&#39;name&#39;</span><span class="p">:</span> <span class="s1">&#39;foo&#39;</span><span class="p">,</span> <span class="s1">&#39;age&#39;</span><span class="p">:</span> <span class="mi">10</span><span class="p">},</span> <span class="p">{</span><span class="s1">&#39;name&#39;</span><span class="p">:</span> <span class="s1">&#39;bar&#39;</span><span class="p">,</span> <span class="s1">&#39;age&#39;</span><span class="p">:</span> <span class="mi">20</span><span class="p">}]</span>
  <span class="p">)</span>
  <span class="n">_</span> <span class="o">=</span> <span class="n">records</span> <span class="o">|</span> <span class="s1">&#39;Write&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">io</span><span class="o">.</span><span class="n">WriteToParquet</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span>
      <span class="n">pyarrow</span><span class="o">.</span><span class="n">schema</span><span class="p">(</span>
          <span class="p">[(</span><span class="s1">&#39;name&#39;</span><span class="p">,</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">binary</span><span class="p">()),</span> <span class="p">(</span><span class="s1">&#39;age&#39;</span><span class="p">,</span> <span class="n">pyarrow</span><span class="o">.</span><span class="n">int64</span><span class="p">())]</span>
      <span class="p">)</span>
  <span class="p">)</span>
</pre></div>
</div>
<p>For more information on supported types and schemas, please see the pyarrow
documentation.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first simple">
<li><strong>file_path_prefix</strong> – The file path to write to. The files written will begin
with this prefix, followed by a shard identifier (see num_shards), and
end in a common extension, if given by file_name_suffix. In most cases,
only this argument is specified and num_shards, shard_name_template, and
file_name_suffix use default values.</li>
<li><strong>schema</strong> – The schema to use, as type of <code class="docutils literal notranslate"><span class="pre">pyarrow.Schema</span></code>.</li>
<li><strong>row_group_buffer_size</strong> – The byte size of the row group buffer. Note that
this size applies to uncompressed data in memory and is normally much
bigger than the actual row group size written to a file.</li>
<li><strong>record_batch_size</strong> – The number of records in each record batch. A record
batch is the basic unit used for storing data in the row group buffer.
A larger record batch size means coarser granularity in the row group buffer
size. To configure the row group size based on the number of records,
set <code class="docutils literal notranslate"><span class="pre">row_group_buffer_size</span></code> to 1 and use <code class="docutils literal notranslate"><span class="pre">record_batch_size</span></code> to
adjust the value.</li>
<li><strong>codec</strong> – The codec to use for block-level compression. Any string supported
by the pyarrow specification is accepted.</li>
<li><strong>use_deprecated_int96_timestamps</strong> – Write nanosecond resolution timestamps to
INT96 Parquet format. Defaults to False.</li>
<li><strong>file_name_suffix</strong> – Suffix for the files written.</li>
<li><strong>num_shards</strong> – The number of files (shards) used for output. If not set, the
service will decide on the optimal number of shards.
Constraining the number of shards is likely to reduce
the performance of a pipeline.  Setting this value is not recommended
unless you require a specific number of output files.</li>
<li><strong>shard_name_template</strong> – A template string containing placeholders for
the shard number and shard count. When constructing a filename for a
particular shard number, the upper-case letters ‘S’ and ‘N’ are
replaced with the 0-padded shard number and shard count respectively.
This argument can be ‘’ (an empty string), in which case it behaves as if
num_shards were set to 1 and only one file is generated. If None is passed
as the shard_name_template, the default pattern ‘-SSSSS-of-NNNNN’ is used.</li>
<li><strong>mime_type</strong> – The MIME type to use for the produced files, if the filesystem
supports specifying MIME types.</li>
</ul>
</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body"><p class="first last">A WriteToParquet transform usable for writing.</p>
</td>
</tr>
</tbody>
</table>
<!-- Signature entry for WriteToParquet.expand(pcoll): links to the highlighted module source and carries a permalink anchor. The description cell (dd) is empty in the generated output. -->
<dl class="method">
<dt id="apache_beam.io.parquetio.WriteToParquet.expand">
<code class="descname">expand</code><span class="sig-paren">(</span><em>pcoll</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#WriteToParquet.expand"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.WriteToParquet.expand" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

<!-- Signature entry for WriteToParquet.display_data(): takes no arguments, links to the highlighted module source and carries a permalink anchor. The description cell (dd) is empty in the generated output. -->
<dl class="method">
<dt id="apache_beam.io.parquetio.WriteToParquet.display_data">
<code class="descname">display_data</code><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/io/parquetio.html#WriteToParquet.display_data"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.io.parquetio.WriteToParquet.display_data" title="Permalink to this definition">¶</a></dt>
<dd></dd></dl>

</dd></dl>

</div>


           </div>
           
          </div>
          <footer>
    <!-- Page footer: previous/next module navigation, copyright notice, and build credits. -->
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
        <!-- The arrow glyphs are decorative icon-font spans. aria-hidden keeps assistive technology from announcing them; the visible "Next" / "Previous" text already names each link. -->
        <a href="apache_beam.io.range_trackers.html" class="btn btn-neutral float-right" title="apache_beam.io.range_trackers module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
        <a href="apache_beam.io.mongodbio.html" class="btn btn-neutral float-left" title="apache_beam.io.mongodbio module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
    </div>

  <hr/>

  <div role="contentinfo">
    <!-- NOTE(review): the copyright holder and year are empty in the generated output; presumably the `copyright` value is unset in the Sphinx configuration. Confirm and fill in at the source. -->
    <p>
        &copy; Copyright
    </p>
  </div>
  <!-- Build credits. The Sphinx link uses HTTPS so the outbound link is not served over plain HTTP. -->
  Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.

</footer>

        </div>
      </div>

    </section>

  </div>
  


  <script type="text/javascript">
      // Once the DOM is ready, switch on the theme's navigation behaviour.
      // NOTE(review): the `true` argument presumably enables the sticky sidebar; confirm against theme.js.
      jQuery(function () {
          var themeNavigation = SphinxRtdTheme.Navigation;
          themeNavigation.enable(true);
      });
  </script>

  
  
    
   

</body>
</html>