<!-- blob: 8efab900ad5169f7eabb5b3c36f394b03f29af48 [file] [log] [blame] -->
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <!-- Encoding and viewport come first, ahead of the title and any assets. -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>apache_beam.io.filebasedsink &mdash; Apache Beam 2.47.0 documentation</title>

  <!-- Scripts execute in document order; only the MathJax CDN script is async. -->
  <script src="../../../_static/js/modernizr.min.js"></script>
  <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
  <script src="../../../_static/jquery.js"></script>
  <script src="../../../_static/underscore.js"></script>
  <script src="../../../_static/doctools.js"></script>
  <script src="../../../_static/language_data.js"></script>
  <script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
  <script src="../../../_static/js/theme.js"></script>

  <!-- Theme and syntax-highlighting stylesheets. -->
  <link rel="stylesheet" href="../../../_static/css/theme.css">
  <link rel="stylesheet" href="../../../_static/pygments.css">

  <!-- Document relations: general index and search page. -->
  <link rel="index" href="../../../genindex.html" title="Index">
  <link rel="search" href="../../../search.html" title="Search">
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
  <!-- Project home link followed by the documented release number. -->
  <a href="../../../index.html" class="icon icon-home"> Apache Beam
  </a>
  <div class="version">
    2.47.0
  </div>
  <!-- Search landmark: the form sends the query to search.html via GET. -->
  <div role="search">
    <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
      <!-- A placeholder is not an accessible name, so aria-label provides one
           for assistive technology. type="text" is kept unchanged. -->
      <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
      <input type="hidden" name="check_keywords" value="yes" />
      <input type="hidden" name="area" value="default" />
    </form>
  </div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
  <!-- First list: links to the apache_beam package pages. -->
  <ul>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.coders.html">apache_beam.coders package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.dataframe.html">apache_beam.dataframe package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.io.html">apache_beam.io package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.metrics.html">apache_beam.metrics package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.ml.html">apache_beam.ml package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.options.html">apache_beam.options package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.portability.html">apache_beam.portability package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.runners.html">apache_beam.runners package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.testing.html">apache_beam.testing package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.transforms.html">apache_beam.transforms package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.typehints.html">apache_beam.typehints package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.utils.html">apache_beam.utils package</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.yaml.html">apache_beam.yaml package</a></li>
  </ul>
  <!-- Second list: links to the top-level apache_beam module pages. -->
  <ul>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.error.html">apache_beam.error module</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.pipeline.html">apache_beam.pipeline module</a></li>
    <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.pvalue.html">apache_beam.pvalue module</a></li>
  </ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
  <!-- Icon carrying the wy-nav-top toggle hook, followed by the home link. -->
  <i class="fa fa-bars" data-toggle="wy-nav-top"></i>
  <a href="../../../index.html">Apache Beam</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
  <!-- Breadcrumb trail: Docs, Module code, then the current module name. -->
  <ul class="wy-breadcrumbs">
    <li><a href="../../../index.html">Docs</a> &raquo;</li>
    <li><a href="../../index.html">Module code</a> &raquo;</li>
    <li>apache_beam.io.filebasedsink</li>
    <li class="wy-breadcrumbs-aside">
    </li>
  </ul>
  <hr>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<h1>Source code for apache_beam.io.filebasedsink</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;File-based sink.&quot;&quot;&quot;</span>
<span class="c1"># pytype: skip-file</span>
<span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">re</span>
<span class="kn">import</span> <span class="nn">time</span>
<span class="kn">import</span> <span class="nn">uuid</span>
<span class="kn">from</span> <span class="nn">apache_beam.internal</span> <span class="kn">import</span> <span class="n">util</span>
<span class="kn">from</span> <span class="nn">apache_beam.io</span> <span class="kn">import</span> <span class="n">iobase</span>
<span class="kn">from</span> <span class="nn">apache_beam.io.filesystem</span> <span class="kn">import</span> <span class="n">BeamIOError</span>
<span class="kn">from</span> <span class="nn">apache_beam.io.filesystem</span> <span class="kn">import</span> <span class="n">CompressionTypes</span>
<span class="kn">from</span> <span class="nn">apache_beam.io.filesystems</span> <span class="kn">import</span> <span class="n">FileSystems</span>
<span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">StaticValueProvider</span>
<span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">ValueProvider</span>
<span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">check_accessible</span>
<span class="kn">from</span> <span class="nn">apache_beam.transforms.display</span> <span class="kn">import</span> <span class="n">DisplayDataItem</span>
<span class="n">DEFAULT_SHARD_NAME_TEMPLATE</span> <span class="o">=</span> <span class="s1">&#39;-SSSSS-of-NNNNN&#39;</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;FileBasedSink&#39;</span><span class="p">]</span>
<span class="n">_LOGGER</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="vm">__name__</span><span class="p">)</span>
<div class="viewcode-block" id="FileBasedSink"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink">[docs]</a><span class="k">class</span> <span class="nc">FileBasedSink</span><span class="p">(</span><span class="n">iobase</span><span class="o">.</span><span class="n">Sink</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;A sink to GCS or local files.</span>
<span class="sd"> To implement a file-based sink, extend this class and override</span>
<span class="sd"> either :meth:`.write_record()` or :meth:`.write_encoded_record()`.</span>
<span class="sd"> If needed, also override :meth:`.open()` and/or :meth:`.close()` to customize</span>
<span class="sd"> the file handling or write headers and footers.</span>
<span class="sd"> The output of this write is a :class:`~apache_beam.pvalue.PCollection` of</span>
<span class="sd"> all written shards.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Max number of threads to be used for renaming.</span>
<span class="n">_MAX_RENAME_THREADS</span> <span class="o">=</span> <span class="mi">64</span>
<span class="fm">__hash__</span> <span class="o">=</span> <span class="kc">None</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">file_path_prefix</span><span class="p">,</span>
<span class="n">coder</span><span class="p">,</span>
<span class="n">file_name_suffix</span><span class="o">=</span><span class="s1">&#39;&#39;</span><span class="p">,</span>
<span class="n">num_shards</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="n">shard_name_template</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">mime_type</span><span class="o">=</span><span class="s1">&#39;application/octet-stream&#39;</span><span class="p">,</span>
<span class="n">compression_type</span><span class="o">=</span><span class="n">CompressionTypes</span><span class="o">.</span><span class="n">AUTO</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">max_records_per_shard</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">max_bytes_per_shard</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">skip_if_empty</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Raises:</span>
<span class="sd"> TypeError: if file path parameters are not a :class:`str` or</span>
<span class="sd"> :class:`~apache_beam.options.value_provider.ValueProvider`, or if</span>
<span class="sd"> **compression_type** is not member of</span>
<span class="sd"> :class:`~apache_beam.io.filesystem.CompressionTypes`.</span>
<span class="sd"> ValueError: if **shard_name_template** is not of expected</span>
<span class="sd"> format.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">ValueProvider</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s1">&#39;file_path_prefix must be a string or ValueProvider;&#39;</span>
<span class="s1">&#39;got </span><span class="si">%r</span><span class="s1"> instead&#39;</span> <span class="o">%</span> <span class="n">file_path_prefix</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_name_suffix</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">ValueProvider</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s1">&#39;file_name_suffix must be a string or ValueProvider;&#39;</span>
<span class="s1">&#39;got </span><span class="si">%r</span><span class="s1"> instead&#39;</span> <span class="o">%</span> <span class="n">file_name_suffix</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">CompressionTypes</span><span class="o">.</span><span class="n">is_valid_compression_type</span><span class="p">(</span><span class="n">compression_type</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s1">&#39;compression_type must be CompressionType object but &#39;</span>
<span class="s1">&#39;was </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">compression_type</span><span class="p">))</span>
<span class="k">if</span> <span class="n">shard_name_template</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">shard_name_template</span> <span class="o">=</span> <span class="n">DEFAULT_SHARD_NAME_TEMPLATE</span>
<span class="k">elif</span> <span class="n">shard_name_template</span> <span class="o">==</span> <span class="s1">&#39;&#39;</span><span class="p">:</span>
<span class="n">num_shards</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">file_path_prefix</span> <span class="o">=</span> <span class="n">StaticValueProvider</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">file_path_prefix</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_name_suffix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">file_name_suffix</span> <span class="o">=</span> <span class="n">StaticValueProvider</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">file_name_suffix</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span> <span class="o">=</span> <span class="n">file_path_prefix</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span> <span class="o">=</span> <span class="n">file_name_suffix</span>
<span class="bp">self</span><span class="o">.</span><span class="n">num_shards</span> <span class="o">=</span> <span class="n">num_shards</span>
<span class="bp">self</span><span class="o">.</span><span class="n">coder</span> <span class="o">=</span> <span class="n">coder</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_template_to_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_template_to_glob_format</span><span class="p">(</span>
<span class="n">shard_name_template</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span> <span class="o">=</span> <span class="n">compression_type</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mime_type</span> <span class="o">=</span> <span class="n">mime_type</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max_records_per_shard</span> <span class="o">=</span> <span class="n">max_records_per_shard</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max_bytes_per_shard</span> <span class="o">=</span> <span class="n">max_bytes_per_shard</span>
<span class="bp">self</span><span class="o">.</span><span class="n">skip_if_empty</span> <span class="o">=</span> <span class="n">skip_if_empty</span>
<div class="viewcode-block" id="FileBasedSink.display_data"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.display_data">[docs]</a> <span class="k">def</span> <span class="nf">display_data</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="p">{</span>
<span class="s1">&#39;shards&#39;</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">num_shards</span><span class="p">,</span>
<span class="n">label</span><span class="o">=</span><span class="s1">&#39;Number of Shards&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">drop_if_default</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span>
<span class="s1">&#39;compression&#39;</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span><span class="p">)),</span>
<span class="s1">&#39;file_pattern&#39;</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span>
<span class="s1">&#39;</span><span class="si">{}{}{}</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="p">),</span>
<span class="n">label</span><span class="o">=</span><span class="s1">&#39;File Pattern&#39;</span><span class="p">)</span>
<span class="p">}</span></div>
<div class="viewcode-block" id="FileBasedSink.open"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.open">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">open</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">temp_path</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Opens ``temp_path``, returning an opaque file handle object.</span>
<span class="sd"> The returned file handle is passed to ``write_[encoded_]record`` and</span>
<span class="sd"> ``close``.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">writer</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
<span class="n">temp_path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mime_type</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_bytes_per_shard</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">byte_counter</span> <span class="o">=</span> <span class="n">_ByteCountingWriter</span><span class="p">(</span><span class="n">writer</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">byte_counter</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">writer</span></div>
<div class="viewcode-block" id="FileBasedSink.write_record"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.write_record">[docs]</a> <span class="k">def</span> <span class="nf">write_record</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Writes a single record to the file handle returned by ``open()``.</span>
<span class="sd"> By default, calls ``write_encoded_record`` after encoding the record with</span>
<span class="sd"> this sink&#39;s Coder.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">write_encoded_record</span><span class="p">(</span><span class="n">file_handle</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">coder</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div>
<div class="viewcode-block" id="FileBasedSink.write_encoded_record"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.write_encoded_record">[docs]</a> <span class="k">def</span> <span class="nf">write_encoded_record</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">,</span> <span class="n">encoded_value</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Writes a single encoded record to the file handle returned by ``open()``.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span></div>
<div class="viewcode-block" id="FileBasedSink.close"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.close">[docs]</a> <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Finalize and close the file handle returned from ``open()``.</span>
<span class="sd"> Called after all records are written.</span>
<span class="sd"> By default, calls ``file_handle.close()`` iff it is not None.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">file_handle</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">file_handle</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div>
<div class="viewcode-block" id="FileBasedSink.initialize_write"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.initialize_write">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">,</span> <span class="s1">&#39;file_name_suffix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">initialize_write</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">file_path_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="n">tmp_dir</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_temp_dir</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">mkdirs</span><span class="p">(</span><span class="n">tmp_dir</span><span class="p">)</span>
<span class="k">return</span> <span class="n">tmp_dir</span></div>
<span class="k">def</span> <span class="nf">_create_temp_dir</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_path_prefix</span><span class="p">):</span>
<span class="n">base_path</span><span class="p">,</span> <span class="n">last_component</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">last_component</span><span class="p">:</span>
<span class="c1"># Trying to re-split the base_path to check if it&#39;s a root.</span>
<span class="n">new_base_path</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">base_path</span><span class="p">)</span>
<span class="k">if</span> <span class="n">base_path</span> <span class="o">==</span> <span class="n">new_base_path</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s1">&#39;Cannot create a temporary directory for root path &#39;</span>
<span class="s1">&#39;prefix </span><span class="si">%s</span><span class="s1">. Please specify a file path prefix with &#39;</span>
<span class="s1">&#39;at least two components.&#39;</span> <span class="o">%</span> <span class="n">file_path_prefix</span><span class="p">)</span>
<span class="n">path_components</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">base_path</span><span class="p">,</span> <span class="s1">&#39;beam-temp-&#39;</span> <span class="o">+</span> <span class="n">last_component</span> <span class="o">+</span> <span class="s1">&#39;-&#39;</span> <span class="o">+</span> <span class="n">uuid</span><span class="o">.</span><span class="n">uuid1</span><span class="p">()</span><span class="o">.</span><span class="n">hex</span>
<span class="p">]</span>
<span class="k">return</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="o">*</span><span class="n">path_components</span><span class="p">)</span>
<div class="viewcode-block" id="FileBasedSink.open_writer"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.open_writer">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">,</span> <span class="s1">&#39;file_name_suffix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">open_writer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">uid</span><span class="p">):</span>
<span class="c1"># A proper suffix is needed for AUTO compression detection.</span>
<span class="c1"># We also ensure there will be no collisions with uid and a</span>
<span class="c1"># (possibly unsharded) file_path_prefix and a (possibly empty)</span>
<span class="c1"># file_name_suffix.</span>
<span class="n">file_path_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="n">file_name_suffix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="n">suffix</span> <span class="o">=</span> <span class="p">(</span><span class="s1">&#39;.&#39;</span> <span class="o">+</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">basename</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span> <span class="o">+</span> <span class="n">file_name_suffix</span><span class="p">)</span>
<span class="n">writer_path</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">init_result</span><span class="p">,</span> <span class="n">uid</span><span class="p">)</span> <span class="o">+</span> <span class="n">suffix</span>
<span class="k">return</span> <span class="n">FileBasedSinkWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer_path</span><span class="p">)</span></div>
<span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">,</span> <span class="s1">&#39;file_name_suffix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">_get_final_name</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span>
<span class="k">return</span> <span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">(),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span> <span class="o">%</span>
<span class="nb">dict</span><span class="p">(</span><span class="n">shard_num</span><span class="o">=</span><span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="o">=</span><span class="n">num_shards</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="p">])</span>
<span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">,</span> <span class="s1">&#39;file_name_suffix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">_get_final_name_glob</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span>
<span class="k">return</span> <span class="s1">&#39;&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">([</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">(),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span> <span class="o">%</span> <span class="nb">dict</span><span class="p">(</span><span class="n">num_shards</span><span class="o">=</span><span class="n">num_shards</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="p">])</span>
<div class="viewcode-block" id="FileBasedSink.pre_finalize"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.pre_finalize">[docs]</a> <span class="k">def</span> <span class="nf">pre_finalize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">):</span>
<span class="n">num_shards</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">writer_results</span><span class="p">))</span>
<span class="n">dst_glob</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name_glob</span><span class="p">(</span><span class="n">num_shards</span><span class="p">)</span>
<span class="n">dst_glob_files</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">dst_glob</span><span class="p">])</span>
<span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">dst_glob_files</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="s1">&#39;Deleting </span><span class="si">%d</span><span class="s1"> existing files in target path matching: </span><span class="si">%s</span><span class="s1">&#39;</span><span class="p">,</span>
<span class="nb">len</span><span class="p">(</span><span class="n">dst_glob_files</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span><span class="p">)</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">dst_glob_files</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_check_state_for_finalize_write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Checks writer output files&#39; states.</span>
<span class="sd"> Returns:</span>
<span class="sd"> src_files, dst_files: Lists of files to rename. For each i, finalize_write</span>
<span class="sd"> should rename(src_files[i], dst_files[i]).</span>
<span class="sd"> delete_files: Src files to delete. These could be leftovers from an</span>
<span class="sd"> incomplete (non-atomic) rename operation.</span>
<span class="sd"> num_skipped: Tally of writer results files already renamed, such as from</span>
<span class="sd"> a previous run of finalize_write().</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">writer_results</span><span class="p">:</span>
<span class="k">return</span> <span class="p">[],</span> <span class="p">[],</span> <span class="p">[],</span> <span class="mi">0</span>
<span class="n">src_glob</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">writer_results</span><span class="p">[</span><span class="mi">0</span><span class="p">])[</span><span class="mi">0</span><span class="p">],</span> <span class="s1">&#39;*&#39;</span><span class="p">)</span>
<span class="n">dst_glob</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name_glob</span><span class="p">(</span><span class="n">num_shards</span><span class="p">)</span>
<span class="n">src_glob_files</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">src_glob</span><span class="p">])</span>
<span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span><span class="p">)</span>
<span class="n">dst_glob_files</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">dst_glob</span><span class="p">])</span>
<span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span><span class="p">)</span>
<span class="n">src_files</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">dst_files</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">delete_files</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">num_skipped</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">shard_num</span><span class="p">,</span> <span class="n">src</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">writer_results</span><span class="p">):</span>
<span class="n">final_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name</span><span class="p">(</span><span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">)</span>
<span class="n">dst</span> <span class="o">=</span> <span class="n">final_name</span>
<span class="n">src_exists</span> <span class="o">=</span> <span class="n">src</span> <span class="ow">in</span> <span class="n">src_glob_files</span>
<span class="n">dst_exists</span> <span class="o">=</span> <span class="n">dst</span> <span class="ow">in</span> <span class="n">dst_glob_files</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">src_exists</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">dst_exists</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">BeamIOError</span><span class="p">(</span>
<span class="s1">&#39;src and dst files do not exist. src: </span><span class="si">%s</span><span class="s1">, dst: </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">src_exists</span> <span class="ow">and</span> <span class="n">dst_exists</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">&#39;src: </span><span class="si">%s</span><span class="s1"> -&gt; dst: </span><span class="si">%s</span><span class="s1"> already renamed, skipping&#39;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span>
<span class="n">num_skipped</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">continue</span>
<span class="k">if</span> <span class="p">(</span><span class="n">src_exists</span> <span class="ow">and</span> <span class="n">dst_exists</span> <span class="ow">and</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">checksum</span><span class="p">(</span><span class="n">src</span><span class="p">)</span> <span class="o">==</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">checksum</span><span class="p">(</span><span class="n">dst</span><span class="p">)):</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">&#39;src: </span><span class="si">%s</span><span class="s1"> == dst: </span><span class="si">%s</span><span class="s1">, deleting src&#39;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span>
<span class="n">delete_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">src</span><span class="p">)</span>
<span class="k">continue</span>
<span class="n">src_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">src</span><span class="p">)</span>
<span class="n">dst_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">dst</span><span class="p">)</span>
<span class="k">return</span> <span class="n">src_files</span><span class="p">,</span> <span class="n">dst_files</span><span class="p">,</span> <span class="n">delete_files</span><span class="p">,</span> <span class="n">num_skipped</span>
<div class="viewcode-block" id="FileBasedSink.finalize_write"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.finalize_write">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">&#39;file_path_prefix&#39;</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">finalize_write</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">,</span> <span class="n">unused_pre_finalize_results</span><span class="p">):</span>
<span class="n">writer_results</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">writer_results</span><span class="p">)</span>
<span class="n">num_shards</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">writer_results</span><span class="p">)</span>
<span class="n">src_files</span><span class="p">,</span> <span class="n">dst_files</span><span class="p">,</span> <span class="n">delete_files</span><span class="p">,</span> <span class="n">num_skipped</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_check_state_for_finalize_write</span><span class="p">(</span><span class="n">writer_results</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">))</span>
<span class="n">num_skipped</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">delete_files</span><span class="p">)</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">delete_files</span><span class="p">)</span>
<span class="n">num_shards_to_finalize</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">src_files</span><span class="p">)</span>
<span class="n">min_threads</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">num_shards_to_finalize</span><span class="p">,</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_MAX_RENAME_THREADS</span><span class="p">)</span>
<span class="n">num_threads</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">min_threads</span><span class="p">)</span>
<span class="n">chunk_size</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">get_chunk_size</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">())</span>
<span class="n">source_file_batch</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">src_files</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">]</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">src_files</span><span class="p">),</span> <span class="n">chunk_size</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">destination_file_batch</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">dst_files</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">]</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">dst_files</span><span class="p">),</span> <span class="n">chunk_size</span><span class="p">)</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">num_shards_to_finalize</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="s1">&#39;Starting finalize_write threads with num_shards: </span><span class="si">%d</span><span class="s1"> (skipped: </span><span class="si">%d</span><span class="s1">), &#39;</span>
<span class="s1">&#39;batches: </span><span class="si">%d</span><span class="s1">, num_threads: </span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">,</span>
<span class="n">num_shards_to_finalize</span><span class="p">,</span>
<span class="n">num_skipped</span><span class="p">,</span>
<span class="nb">len</span><span class="p">(</span><span class="n">source_file_batch</span><span class="p">),</span>
<span class="n">num_threads</span><span class="p">)</span>
<span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="c1"># Use a thread pool for renaming operations.</span>
<span class="k">def</span> <span class="nf">_rename_batch</span><span class="p">(</span><span class="n">batch</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;_rename_batch executes batch rename operations.&quot;&quot;&quot;</span>
<span class="n">source_files</span><span class="p">,</span> <span class="n">destination_files</span> <span class="o">=</span> <span class="n">batch</span>
<span class="n">exceptions</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">source_files</span><span class="p">,</span> <span class="n">destination_files</span><span class="p">)</span>
<span class="k">return</span> <span class="n">exceptions</span>
<span class="k">except</span> <span class="n">BeamIOError</span> <span class="k">as</span> <span class="n">exp</span><span class="p">:</span>
<span class="k">if</span> <span class="n">exp</span><span class="o">.</span><span class="n">exception_details</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span>
<span class="k">for</span> <span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">),</span> <span class="n">exception</span> <span class="ow">in</span> <span class="n">exp</span><span class="o">.</span><span class="n">exception_details</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="k">if</span> <span class="n">exception</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">error</span><span class="p">(</span>
<span class="p">(</span><span class="s1">&#39;Exception in _rename_batch. src: </span><span class="si">%s</span><span class="s1">, &#39;</span>
<span class="s1">&#39;dst: </span><span class="si">%s</span><span class="s1">, err: </span><span class="si">%s</span><span class="s1">&#39;</span><span class="p">),</span>
<span class="n">src</span><span class="p">,</span>
<span class="n">dst</span><span class="p">,</span>
<span class="n">exception</span><span class="p">)</span>
<span class="n">exceptions</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">exception</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">&#39;Rename successful: </span><span class="si">%s</span><span class="s1"> -&gt; </span><span class="si">%s</span><span class="s1">&#39;</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span>
<span class="k">return</span> <span class="n">exceptions</span>
<span class="n">exception_batches</span> <span class="o">=</span> <span class="n">util</span><span class="o">.</span><span class="n">run_using_threadpool</span><span class="p">(</span>
<span class="n">_rename_batch</span><span class="p">,</span>
<span class="nb">list</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">source_file_batch</span><span class="p">,</span> <span class="n">destination_file_batch</span><span class="p">)),</span>
<span class="n">num_threads</span><span class="p">)</span>
<span class="n">all_exceptions</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">e</span> <span class="k">for</span> <span class="n">exception_batch</span> <span class="ow">in</span> <span class="n">exception_batches</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">exception_batch</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">all_exceptions</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">Exception</span><span class="p">(</span>
<span class="s1">&#39;Encountered exceptions in finalize_write: </span><span class="si">%s</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="n">all_exceptions</span><span class="p">)</span>
<span class="k">yield from</span> <span class="n">dst_files</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span>
<span class="s1">&#39;Renamed </span><span class="si">%d</span><span class="s1"> shards in </span><span class="si">%.2f</span><span class="s1"> seconds.&#39;</span><span class="p">,</span>
<span class="n">num_shards_to_finalize</span><span class="p">,</span>
<span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_time</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="s1">&#39;No shards found to finalize. num_shards: </span><span class="si">%d</span><span class="s1">, skipped: </span><span class="si">%d</span><span class="s1">&#39;</span><span class="p">,</span>
<span class="n">num_shards</span><span class="p">,</span>
<span class="n">num_skipped</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">([</span><span class="n">init_result</span><span class="p">])</span>
<span class="k">except</span> <span class="ne">IOError</span><span class="p">:</span>
<span class="c1"># This error is not serious, we simply log it.</span>
<span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;Unable to delete file: </span><span class="si">%s</span><span class="s1">&#39;</span><span class="p">,</span> <span class="n">init_result</span><span class="p">)</span></div>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">&#39;N+&#39;</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span>
<span class="k">if</span> <span class="n">match</span><span class="p">:</span>
<span class="n">shard_name_template</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span>
<span class="k">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">&#39;</span><span class="si">%%</span><span class="s1">(num_shards)0</span><span class="si">%d</span><span class="s1">d&#39;</span> <span class="o">%</span> <span class="nb">len</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">shard_name_template</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_template_to_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">shard_name_template</span><span class="p">:</span>
<span class="k">return</span> <span class="s1">&#39;&#39;</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">&#39;S+&#39;</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span>
<span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Shard number pattern S+ not found in shard_name_template: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span>
<span class="n">shard_name_template</span><span class="p">)</span>
<span class="n">shard_name_format</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span>
<span class="k">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">&#39;</span><span class="si">%%</span><span class="s1">(shard_num)0</span><span class="si">%d</span><span class="s1">d&#39;</span> <span class="o">%</span> <span class="nb">len</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_format</span><span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_template_to_glob_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">shard_name_template</span><span class="p">:</span>
<span class="k">return</span> <span class="s1">&#39;&#39;</span>
<span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">&#39;S+&#39;</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span>
<span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Shard number pattern S+ not found in shard_name_template: </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span>
<span class="n">shard_name_template</span><span class="p">)</span>
<span class="n">shard_name_format</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">&#39;*&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_format</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span>
<span class="c1"># TODO: Clean up workitem_test which uses this.</span>
<span class="c1"># pylint: disable=unidiomatic-typecheck</span>
<span class="k">return</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__dict__</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="vm">__dict__</span></div>
<span class="k">class</span> <span class="nc">FileBasedSinkWriter</span><span class="p">(</span><span class="n">iobase</span><span class="o">.</span><span class="n">Writer</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;The writer for FileBasedSink.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sink</span><span class="p">,</span> <span class="n">temp_shard_path</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span> <span class="o">=</span> <span class="n">sink</span>
<span class="bp">self</span><span class="o">.</span><span class="n">temp_shard_path</span> <span class="o">=</span> <span class="n">temp_shard_path</span>
<span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">temp_shard_path</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">write_record</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">at_capacity</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">return</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_records_per_shard</span> <span class="ow">and</span>
<span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_records_per_shard</span>
<span class="p">)</span> <span class="ow">or</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_bytes_per_shard</span> <span class="ow">and</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">byte_counter</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_bytes_per_shard</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">close</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">temp_shard_path</span>
<span class="k">class</span> <span class="nc">_ByteCountingWriter</span><span class="p">:</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span> <span class="o">=</span> <span class="n">writer</span>
<span class="bp">self</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bs</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">bs</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">bs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">flush</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
</pre></div>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Once the DOM is ready, switch on the Read the Docs theme's sidebar navigation.
jQuery(function () {
    var themeNavigation = SphinxRtdTheme.Navigation;
    themeNavigation.enable(true);
});
</script>
</body>
</html>