| |
| |
| <!DOCTYPE html> |
| <!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>apache_beam.io.filebasedsink — Apache Beam 2.47.0 documentation</title> |
| |
| <!-- Scripts are kept in their original order; none are deferred. --> |
| <script src="../../../_static/js/modernizr.min.js"></script> |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script src="../../../_static/js/theme.js"></script> |
| |
| <!-- Stylesheets, followed by the index and search relations. --> |
| <link rel="stylesheet" href="../../../_static/css/theme.css"> |
| <link rel="stylesheet" href="../../../_static/pygments.css"> |
| <link rel="index" href="../../../genindex.html" title="Index"> |
| <link rel="search" href="../../../search.html" title="Search"> |
| </head> |
| |
| <body class="wy-body-for-nav"> |
| |
| |
| <div class="wy-grid-for-nav"> |
| |
| <nav data-toggle="wy-nav-shift" class="wy-nav-side"> |
| <div class="wy-side-scroll"> |
| <div class="wy-side-nav-search"> |
| |
| <a href="../../../index.html" class="icon icon-home"> Apache Beam |
| |
| </a> |
| |
| <div class="version"> |
| 2.47.0 |
| </div> |
| |
| <div role="search"> |
| <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get"> |
| <!-- A placeholder is not a reliable accessible name, so the query field also carries aria-label. --> |
| <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" /> |
| <input type="hidden" name="check_keywords" value="yes" /> |
| <input type="hidden" name="area" value="default" /> |
| </form> |
| </div> |
| |
| </div> |
| |
| <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> |
| |
| <!-- Package pages. --> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.coders.html">apache_beam.coders package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.dataframe.html">apache_beam.dataframe package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.io.html">apache_beam.io package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.metrics.html">apache_beam.metrics package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.ml.html">apache_beam.ml package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.options.html">apache_beam.options package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.portability.html">apache_beam.portability package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.runners.html">apache_beam.runners package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.testing.html">apache_beam.testing package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.transforms.html">apache_beam.transforms package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.typehints.html">apache_beam.typehints package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.utils.html">apache_beam.utils package</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.yaml.html">apache_beam.yaml package</a></li> |
| </ul> |
| <!-- Top-level module pages. --> |
| <ul> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.error.html">apache_beam.error module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.pipeline.html">apache_beam.pipeline module</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="../../../apache_beam.pvalue.html">apache_beam.pvalue module</a></li> |
| </ul> |
| |
| </div> |
| </div> |
| </nav> |
| |
| <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"> |
| |
| |
| <nav class="wy-nav-top" aria-label="top navigation"> |
| <!-- Toggle icon followed by the link back to the documentation root. --> |
| <i class="fa fa-bars" data-toggle="wy-nav-top"></i> |
| <a href="../../../index.html">Apache Beam</a> |
| |
| </nav> |
| |
| |
| <div class="wy-nav-content"> |
| |
| <div class="rst-content"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <div role="navigation" aria-label="breadcrumbs navigation"> |
| |
| <!-- Trail: Docs » Module code » current module. --> |
| <ul class="wy-breadcrumbs"> |
| |
| <li><a href="../../../index.html">Docs</a> »</li> |
| |
| <li><a href="../../index.html">Module code</a> »</li> |
| |
| <li>apache_beam.io.filebasedsink</li> |
| |
| <!-- Aside slot; it has no content on this page. --> |
| <li class="wy-breadcrumbs-aside"> |
| |
| </li> |
| |
| </ul> |
| |
| <hr> |
| </div> |
| <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article"> |
| <div itemprop="articleBody"> |
| |
| <h1>Source code for apache_beam.io.filebasedsink</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""File-based sink."""</span> |
| |
| <span class="c1"># pytype: skip-file</span> |
| |
| <span class="kn">import</span> <span class="nn">logging</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">re</span> |
| <span class="kn">import</span> <span class="nn">time</span> |
| <span class="kn">import</span> <span class="nn">uuid</span> |
| |
| <span class="kn">from</span> <span class="nn">apache_beam.internal</span> <span class="kn">import</span> <span class="n">util</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.io</span> <span class="kn">import</span> <span class="n">iobase</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.io.filesystem</span> <span class="kn">import</span> <span class="n">BeamIOError</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.io.filesystem</span> <span class="kn">import</span> <span class="n">CompressionTypes</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.io.filesystems</span> <span class="kn">import</span> <span class="n">FileSystems</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">StaticValueProvider</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">ValueProvider</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.options.value_provider</span> <span class="kn">import</span> <span class="n">check_accessible</span> |
| <span class="kn">from</span> <span class="nn">apache_beam.transforms.display</span> <span class="kn">import</span> <span class="n">DisplayDataItem</span> |
| |
| <span class="n">DEFAULT_SHARD_NAME_TEMPLATE</span> <span class="o">=</span> <span class="s1">'-SSSSS-of-NNNNN'</span> |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'FileBasedSink'</span><span class="p">]</span> |
| |
| <span class="n">_LOGGER</span> <span class="o">=</span> <span class="n">logging</span><span class="o">.</span><span class="n">getLogger</span><span class="p">(</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="FileBasedSink"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink">[docs]</a><span class="k">class</span> <span class="nc">FileBasedSink</span><span class="p">(</span><span class="n">iobase</span><span class="o">.</span><span class="n">Sink</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""A sink to GCS or local files.</span> |
| |
| <span class="sd"> To implement a file-based sink, extend this class and override</span> |
| <span class="sd"> either :meth:`.write_record()` or :meth:`.write_encoded_record()`.</span> |
| |
| <span class="sd"> If needed, also override :meth:`.open()` and/or :meth:`.close()` to customize</span> |
| <span class="sd"> the file handling or write headers and footers.</span> |
| |
| <span class="sd"> The output of this write is a :class:`~apache_beam.pvalue.PCollection` of</span> |
| <span class="sd"> all written shards.</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># Max number of threads to be used for renaming.</span> |
| <span class="n">_MAX_RENAME_THREADS</span> <span class="o">=</span> <span class="mi">64</span> |
| <span class="fm">__hash__</span> <span class="o">=</span> <span class="kc">None</span> <span class="c1"># type: ignore[assignment]</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">file_path_prefix</span><span class="p">,</span> |
| <span class="n">coder</span><span class="p">,</span> |
| <span class="n">file_name_suffix</span><span class="o">=</span><span class="s1">''</span><span class="p">,</span> |
| <span class="n">num_shards</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> |
| <span class="n">shard_name_template</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">mime_type</span><span class="o">=</span><span class="s1">'application/octet-stream'</span><span class="p">,</span> |
| <span class="n">compression_type</span><span class="o">=</span><span class="n">CompressionTypes</span><span class="o">.</span><span class="n">AUTO</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">max_records_per_shard</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">max_bytes_per_shard</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="n">skip_if_empty</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Raises:</span> |
| <span class="sd"> TypeError: if file path parameters are not a :class:`str` or</span> |
| <span class="sd"> :class:`~apache_beam.options.value_provider.ValueProvider`, or if</span> |
| <span class="sd"> **compression_type** is not a member of</span> |
| <span class="sd"> :class:`~apache_beam.io.filesystem.CompressionTypes`.</span> |
| <span class="sd"> ValueError: if **shard_name_template** is not of expected</span> |
| <span class="sd"> format.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">ValueProvider</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s1">'file_path_prefix must be a string or ValueProvider;'</span> |
| <span class="s1">'got </span><span class="si">%r</span><span class="s1"> instead'</span> <span class="o">%</span> <span class="n">file_path_prefix</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_name_suffix</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">ValueProvider</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s1">'file_name_suffix must be a string or ValueProvider;'</span> |
| <span class="s1">'got </span><span class="si">%r</span><span class="s1"> instead'</span> <span class="o">%</span> <span class="n">file_name_suffix</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">CompressionTypes</span><span class="o">.</span><span class="n">is_valid_compression_type</span><span class="p">(</span><span class="n">compression_type</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s1">'compression_type must be CompressionType object but '</span> |
| <span class="s1">'was </span><span class="si">%s</span><span class="s1">'</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">compression_type</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">shard_name_template</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">shard_name_template</span> <span class="o">=</span> <span class="n">DEFAULT_SHARD_NAME_TEMPLATE</span> |
| <span class="k">elif</span> <span class="n">shard_name_template</span> <span class="o">==</span> <span class="s1">''</span><span class="p">:</span> |
| <span class="n">num_shards</span> <span class="o">=</span> <span class="mi">1</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">file_path_prefix</span> <span class="o">=</span> <span class="n">StaticValueProvider</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">file_path_prefix</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">file_name_suffix</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">file_name_suffix</span> <span class="o">=</span> <span class="n">StaticValueProvider</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">file_name_suffix</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span> <span class="o">=</span> <span class="n">file_path_prefix</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span> <span class="o">=</span> <span class="n">file_name_suffix</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">num_shards</span> <span class="o">=</span> <span class="n">num_shards</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">coder</span> <span class="o">=</span> <span class="n">coder</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_template_to_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_template_to_glob_format</span><span class="p">(</span> |
| <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span> <span class="o">=</span> <span class="n">compression_type</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">mime_type</span> <span class="o">=</span> <span class="n">mime_type</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">max_records_per_shard</span> <span class="o">=</span> <span class="n">max_records_per_shard</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">max_bytes_per_shard</span> <span class="o">=</span> <span class="n">max_bytes_per_shard</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">skip_if_empty</span> <span class="o">=</span> <span class="n">skip_if_empty</span> |
| |
| <div class="viewcode-block" id="FileBasedSink.display_data"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.display_data">[docs]</a> <span class="k">def</span> <span class="nf">display_data</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">{</span> |
| <span class="s1">'shards'</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">num_shards</span><span class="p">,</span> |
| <span class="n">label</span><span class="o">=</span><span class="s1">'Number of Shards'</span><span class="p">)</span><span class="o">.</span><span class="n">drop_if_default</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> |
| <span class="s1">'compression'</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span><span class="p">)),</span> |
| <span class="s1">'file_pattern'</span><span class="p">:</span> <span class="n">DisplayDataItem</span><span class="p">(</span> |
| <span class="s1">'</span><span class="si">{}{}{}</span><span class="s1">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="p">),</span> |
| <span class="n">label</span><span class="o">=</span><span class="s1">'File Pattern'</span><span class="p">)</span> |
| <span class="p">}</span></div> |
| |
| <div class="viewcode-block" id="FileBasedSink.open"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.open">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">open</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">temp_path</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""Opens ``temp_path``, returning an opaque file handle object.</span> |
| |
| <span class="sd"> The returned file handle is passed to ``write_[encoded_]record`` and</span> |
| <span class="sd"> ``close``.</span> |
| <span class="sd"> """</span> |
| <span class="n">writer</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">create</span><span class="p">(</span> |
| <span class="n">temp_path</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">mime_type</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">compression_type</span><span class="p">)</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_bytes_per_shard</span><span class="p">:</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">byte_counter</span> <span class="o">=</span> <span class="n">_ByteCountingWriter</span><span class="p">(</span><span class="n">writer</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">byte_counter</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">writer</span></div> |
| |
| <div class="viewcode-block" id="FileBasedSink.write_record"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.write_record">[docs]</a> <span class="k">def</span> <span class="nf">write_record</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""Writes a single record to the file handle returned by ``open()``.</span> |
| |
| <span class="sd"> By default, calls ``write_encoded_record`` after encoding the record with</span> |
| <span class="sd"> this sink's Coder.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">write_encoded_record</span><span class="p">(</span><span class="n">file_handle</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">coder</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="FileBasedSink.write_encoded_record"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.write_encoded_record">[docs]</a> <span class="k">def</span> <span class="nf">write_encoded_record</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">,</span> <span class="n">encoded_value</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""Writes a single encoded record to the file handle returned by ``open()``.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span></div> |
| |
| <div class="viewcode-block" id="FileBasedSink.close"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.close">[docs]</a> <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_handle</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""Finalize and close the file handle returned from ``open()``.</span> |
| |
| <span class="sd"> Called after all records are written.</span> |
| |
| <span class="sd"> By default, calls ``file_handle.close()`` iff it is not None.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">file_handle</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">file_handle</span><span class="o">.</span><span class="n">close</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="FileBasedSink.initialize_write"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.initialize_write">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">,</span> <span class="s1">'file_name_suffix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">initialize_write</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="n">file_path_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| |
| <span class="n">tmp_dir</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_temp_dir</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">mkdirs</span><span class="p">(</span><span class="n">tmp_dir</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">tmp_dir</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_temp_dir</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">file_path_prefix</span><span class="p">):</span> |
| <span class="n">base_path</span><span class="p">,</span> <span class="n">last_component</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">last_component</span><span class="p">:</span> |
| <span class="c1"># Trying to re-split the base_path to check if it's a root.</span> |
| <span class="n">new_base_path</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">base_path</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">base_path</span> <span class="o">==</span> <span class="n">new_base_path</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s1">'Cannot create a temporary directory for root path '</span> |
| <span class="s1">'prefix </span><span class="si">%s</span><span class="s1">. Please specify a file path prefix with '</span> |
| <span class="s1">'at least two components.'</span> <span class="o">%</span> <span class="n">file_path_prefix</span><span class="p">)</span> |
| <span class="n">path_components</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">base_path</span><span class="p">,</span> <span class="s1">'beam-temp-'</span> <span class="o">+</span> <span class="n">last_component</span> <span class="o">+</span> <span class="s1">'-'</span> <span class="o">+</span> <span class="n">uuid</span><span class="o">.</span><span class="n">uuid1</span><span class="p">()</span><span class="o">.</span><span class="n">hex</span> |
| <span class="p">]</span> |
| <span class="k">return</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="o">*</span><span class="n">path_components</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="FileBasedSink.open_writer"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.open_writer">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">,</span> <span class="s1">'file_name_suffix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">open_writer</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">uid</span><span class="p">):</span> |
| <span class="c1"># A proper suffix is needed for AUTO compression detection.</span> |
| <span class="c1"># We also ensure there will be no collisions with uid and a</span> |
| <span class="c1"># (possibly unsharded) file_path_prefix and a (possibly empty)</span> |
| <span class="c1"># file_name_suffix.</span> |
| <span class="n">file_path_prefix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| <span class="n">file_name_suffix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| <span class="n">suffix</span> <span class="o">=</span> <span class="p">(</span><span class="s1">'.'</span> <span class="o">+</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">basename</span><span class="p">(</span><span class="n">file_path_prefix</span><span class="p">)</span> <span class="o">+</span> <span class="n">file_name_suffix</span><span class="p">)</span> |
| <span class="n">writer_path</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">init_result</span><span class="p">,</span> <span class="n">uid</span><span class="p">)</span> <span class="o">+</span> <span class="n">suffix</span> |
| <span class="k">return</span> <span class="n">FileBasedSinkWriter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer_path</span><span class="p">)</span></div> |
| |
| <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">,</span> <span class="s1">'file_name_suffix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">_get_final_name</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span> |
| <span class="k">return</span> <span class="s1">''</span><span class="o">.</span><span class="n">join</span><span class="p">([</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">(),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_format</span> <span class="o">%</span> |
| <span class="nb">dict</span><span class="p">(</span><span class="n">shard_num</span><span class="o">=</span><span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="o">=</span><span class="n">num_shards</span><span class="p">),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| <span class="p">])</span> |
| |
| <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">,</span> <span class="s1">'file_name_suffix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">_get_final_name_glob</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span> |
| <span class="k">return</span> <span class="s1">''</span><span class="o">.</span><span class="n">join</span><span class="p">([</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">(),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span> <span class="o">%</span> <span class="nb">dict</span><span class="p">(</span><span class="n">num_shards</span><span class="o">=</span><span class="n">num_shards</span><span class="p">),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">file_name_suffix</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| <span class="p">])</span> |
| |
| <div class="viewcode-block" id="FileBasedSink.pre_finalize"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.pre_finalize">[docs]</a> <span class="k">def</span> <span class="nf">pre_finalize</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">):</span> |
| <span class="n">num_shards</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">writer_results</span><span class="p">))</span> |
| <span class="n">dst_glob</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name_glob</span><span class="p">(</span><span class="n">num_shards</span><span class="p">)</span> |
| <span class="n">dst_glob_files</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">dst_glob</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">dst_glob_files</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span> |
| <span class="s1">'Deleting </span><span class="si">%d</span><span class="s1"> existing files in target path matching: </span><span class="si">%s</span><span class="s1">'</span><span class="p">,</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">dst_glob_files</span><span class="p">),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">shard_name_glob_format</span><span class="p">)</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">dst_glob_files</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_check_state_for_finalize_write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""Checks writer output files' states.</span> |
| |
| <span class="sd"> Returns:</span> |
| <span class="sd"> src_files, dst_files: Lists of files to rename. For each i, finalize_write</span> |
| <span class="sd"> should rename(src_files[i], dst_files[i]).</span> |
| <span class="sd"> delete_files: Src files to delete. These could be leftovers from an</span> |
| <span class="sd"> incomplete (non-atomic) rename operation.</span> |
| <span class="sd"> num_skipped: Tally of writer results files already renamed, such as from</span> |
| <span class="sd"> a previous run of finalize_write().</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">writer_results</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">[],</span> <span class="p">[],</span> <span class="p">[],</span> <span class="mi">0</span> |
| |
| <span class="n">src_glob</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">FileSystems</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">writer_results</span><span class="p">[</span><span class="mi">0</span><span class="p">])[</span><span class="mi">0</span><span class="p">],</span> <span class="s1">'*'</span><span class="p">)</span> |
| <span class="n">dst_glob</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name_glob</span><span class="p">(</span><span class="n">num_shards</span><span class="p">)</span> |
| <span class="n">src_glob_files</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">src_glob</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span><span class="p">)</span> |
| <span class="n">dst_glob_files</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">file_metadata</span><span class="o">.</span><span class="n">path</span> <span class="k">for</span> <span class="n">mr</span> <span class="ow">in</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">match</span><span class="p">([</span><span class="n">dst_glob</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">file_metadata</span> <span class="ow">in</span> <span class="n">mr</span><span class="o">.</span><span class="n">metadata_list</span><span class="p">)</span> |
| |
| <span class="n">src_files</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">dst_files</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">delete_files</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">num_skipped</span> <span class="o">=</span> <span class="mi">0</span> |
| <span class="k">for</span> <span class="n">shard_num</span><span class="p">,</span> <span class="n">src</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">writer_results</span><span class="p">):</span> |
| <span class="n">final_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_get_final_name</span><span class="p">(</span><span class="n">shard_num</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">)</span> |
| <span class="n">dst</span> <span class="o">=</span> <span class="n">final_name</span> |
| <span class="n">src_exists</span> <span class="o">=</span> <span class="n">src</span> <span class="ow">in</span> <span class="n">src_glob_files</span> |
| <span class="n">dst_exists</span> <span class="o">=</span> <span class="n">dst</span> <span class="ow">in</span> <span class="n">dst_glob_files</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">src_exists</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">dst_exists</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">BeamIOError</span><span class="p">(</span> |
| <span class="s1">'src and dst files do not exist. src: </span><span class="si">%s</span><span class="s1">, dst: </span><span class="si">%s</span><span class="s1">'</span> <span class="o">%</span> <span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">))</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">src_exists</span> <span class="ow">and</span> <span class="n">dst_exists</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">'src: </span><span class="si">%s</span><span class="s1"> -> dst: </span><span class="si">%s</span><span class="s1"> already renamed, skipping'</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span> |
| <span class="n">num_skipped</span> <span class="o">+=</span> <span class="mi">1</span> |
| <span class="k">continue</span> |
| <span class="k">if</span> <span class="p">(</span><span class="n">src_exists</span> <span class="ow">and</span> <span class="n">dst_exists</span> <span class="ow">and</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">checksum</span><span class="p">(</span><span class="n">src</span><span class="p">)</span> <span class="o">==</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">checksum</span><span class="p">(</span><span class="n">dst</span><span class="p">)):</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">'src: </span><span class="si">%s</span><span class="s1"> == dst: </span><span class="si">%s</span><span class="s1">, deleting src'</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span> |
| <span class="n">delete_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">src</span><span class="p">)</span> |
| <span class="k">continue</span> |
| |
| <span class="n">src_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">src</span><span class="p">)</span> |
| <span class="n">dst_files</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">dst</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">src_files</span><span class="p">,</span> <span class="n">dst_files</span><span class="p">,</span> <span class="n">delete_files</span><span class="p">,</span> <span class="n">num_skipped</span> |
| |
| <div class="viewcode-block" id="FileBasedSink.finalize_write"><a class="viewcode-back" href="../../../apache_beam.io.filebasedsink.html#apache_beam.io.filebasedsink.FileBasedSink.finalize_write">[docs]</a> <span class="nd">@check_accessible</span><span class="p">([</span><span class="s1">'file_path_prefix'</span><span class="p">])</span> |
| <span class="k">def</span> <span class="nf">finalize_write</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">init_result</span><span class="p">,</span> <span class="n">writer_results</span><span class="p">,</span> <span class="n">unused_pre_finalize_results</span><span class="p">):</span> |
| <span class="n">writer_results</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">writer_results</span><span class="p">)</span> |
| <span class="n">num_shards</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">writer_results</span><span class="p">)</span> |
| |
| <span class="n">src_files</span><span class="p">,</span> <span class="n">dst_files</span><span class="p">,</span> <span class="n">delete_files</span><span class="p">,</span> <span class="n">num_skipped</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_check_state_for_finalize_write</span><span class="p">(</span><span class="n">writer_results</span><span class="p">,</span> <span class="n">num_shards</span><span class="p">))</span> |
| <span class="n">num_skipped</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">delete_files</span><span class="p">)</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">(</span><span class="n">delete_files</span><span class="p">)</span> |
| <span class="n">num_shards_to_finalize</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">src_files</span><span class="p">)</span> |
| <span class="n">min_threads</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">num_shards_to_finalize</span><span class="p">,</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_MAX_RENAME_THREADS</span><span class="p">)</span> |
| <span class="n">num_threads</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">min_threads</span><span class="p">)</span> |
| |
| <span class="n">chunk_size</span> <span class="o">=</span> <span class="n">FileSystems</span><span class="o">.</span><span class="n">get_chunk_size</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">file_path_prefix</span><span class="o">.</span><span class="n">get</span><span class="p">())</span> |
| <span class="n">source_file_batch</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">src_files</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">src_files</span><span class="p">),</span> <span class="n">chunk_size</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">destination_file_batch</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">dst_files</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i</span> <span class="o">+</span> <span class="n">chunk_size</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">dst_files</span><span class="p">),</span> <span class="n">chunk_size</span><span class="p">)</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">num_shards_to_finalize</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span> |
| <span class="s1">'Starting finalize_write threads with num_shards: </span><span class="si">%d</span><span class="s1"> (skipped: </span><span class="si">%d</span><span class="s1">), '</span> |
| <span class="s1">'batches: </span><span class="si">%d</span><span class="s1">, num_threads: </span><span class="si">%d</span><span class="s1">'</span><span class="p">,</span> |
| <span class="n">num_shards_to_finalize</span><span class="p">,</span> |
| <span class="n">num_skipped</span><span class="p">,</span> |
| <span class="nb">len</span><span class="p">(</span><span class="n">source_file_batch</span><span class="p">),</span> |
| <span class="n">num_threads</span><span class="p">)</span> |
| <span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> |
| |
| <span class="c1"># Use a thread pool for renaming operations.</span> |
| <span class="k">def</span> <span class="nf">_rename_batch</span><span class="p">(</span><span class="n">batch</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""_rename_batch executes batch rename operations."""</span> |
| <span class="n">source_files</span><span class="p">,</span> <span class="n">destination_files</span> <span class="o">=</span> <span class="n">batch</span> |
| <span class="n">exceptions</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">source_files</span><span class="p">,</span> <span class="n">destination_files</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">exceptions</span> |
| <span class="k">except</span> <span class="n">BeamIOError</span> <span class="k">as</span> <span class="n">exp</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">exp</span><span class="o">.</span><span class="n">exception_details</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> |
| <span class="k">for</span> <span class="p">(</span><span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">),</span> <span class="n">exception</span> <span class="ow">in</span> <span class="n">exp</span><span class="o">.</span><span class="n">exception_details</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="k">if</span> <span class="n">exception</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">error</span><span class="p">(</span> |
| <span class="p">(</span><span class="s1">'Exception in _rename_batch. src: </span><span class="si">%s</span><span class="s1">, '</span> |
| <span class="s1">'dst: </span><span class="si">%s</span><span class="s1">, err: </span><span class="si">%s</span><span class="s1">'</span><span class="p">),</span> |
| <span class="n">src</span><span class="p">,</span> |
| <span class="n">dst</span><span class="p">,</span> |
| <span class="n">exception</span><span class="p">)</span> |
| <span class="n">exceptions</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">exception</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="s1">'Rename successful: </span><span class="si">%s</span><span class="s1"> -> </span><span class="si">%s</span><span class="s1">'</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">dst</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">exceptions</span> |
| |
| <span class="n">exception_batches</span> <span class="o">=</span> <span class="n">util</span><span class="o">.</span><span class="n">run_using_threadpool</span><span class="p">(</span> |
| <span class="n">_rename_batch</span><span class="p">,</span> |
| <span class="nb">list</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">source_file_batch</span><span class="p">,</span> <span class="n">destination_file_batch</span><span class="p">)),</span> |
| <span class="n">num_threads</span><span class="p">)</span> |
| |
| <span class="n">all_exceptions</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">e</span> <span class="k">for</span> <span class="n">exception_batch</span> <span class="ow">in</span> <span class="n">exception_batches</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">exception_batch</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="n">all_exceptions</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">Exception</span><span class="p">(</span> |
| <span class="s1">'Encountered exceptions in finalize_write: </span><span class="si">%s</span><span class="s1">'</span> <span class="o">%</span> <span class="n">all_exceptions</span><span class="p">)</span> |
| |
| <span class="k">yield from</span> <span class="n">dst_files</span> |
| |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span> |
| <span class="s1">'Renamed </span><span class="si">%d</span><span class="s1"> shards in </span><span class="si">%.2f</span><span class="s1"> seconds.'</span><span class="p">,</span> |
| <span class="n">num_shards_to_finalize</span><span class="p">,</span> |
| <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_time</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span> |
| <span class="s1">'No shards found to finalize. num_shards: </span><span class="si">%d</span><span class="s1">, skipped: </span><span class="si">%d</span><span class="s1">'</span><span class="p">,</span> |
| <span class="n">num_shards</span><span class="p">,</span> |
| <span class="n">num_skipped</span><span class="p">)</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="n">FileSystems</span><span class="o">.</span><span class="n">delete</span><span class="p">([</span><span class="n">init_result</span><span class="p">])</span> |
| <span class="k">except</span> <span class="ne">IOError</span><span class="p">:</span> |
| <span class="c1"># This error is not serious, we simply log it.</span> |
| <span class="n">_LOGGER</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">'Unable to delete file: </span><span class="si">%s</span><span class="s1">'</span><span class="p">,</span> <span class="n">init_result</span><span class="p">)</span></div> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">'N+'</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span><span class="p">:</span> |
| <span class="n">shard_name_template</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span> |
| <span class="k">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">'</span><span class="si">%%</span><span class="s1">(num_shards)0</span><span class="si">%d</span><span class="s1">d'</span> <span class="o">%</span> <span class="nb">len</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span> |
| <span class="k">return</span> <span class="n">shard_name_template</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_template_to_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">shard_name_template</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s1">''</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">'S+'</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Shard number pattern S+ not found in shard_name_template: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> |
| <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="n">shard_name_format</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span> |
| <span class="k">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">'</span><span class="si">%%</span><span class="s1">(shard_num)0</span><span class="si">%d</span><span class="s1">d'</span> <span class="o">%</span> <span class="nb">len</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span> |
| <span class="k">return</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_format</span><span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_template_to_glob_format</span><span class="p">(</span><span class="n">shard_name_template</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">shard_name_template</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s1">''</span> |
| <span class="n">match</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="s1">'S+'</span><span class="p">,</span> <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">match</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"Shard number pattern S+ not found in shard_name_template: </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> |
| <span class="n">shard_name_template</span><span class="p">)</span> |
| <span class="n">shard_name_format</span> <span class="o">=</span> <span class="n">shard_name_template</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="n">match</span><span class="o">.</span><span class="n">group</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="s1">'*'</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">FileBasedSink</span><span class="o">.</span><span class="n">_template_replace_num_shards</span><span class="p">(</span><span class="n">shard_name_format</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="c1"># TODO: Clean up workitem_test which uses this.</span> |
| <span class="c1"># pylint: disable=unidiomatic-typecheck</span> |
| <span class="k">return</span> <span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">==</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">)</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__dict__</span> <span class="o">==</span> <span class="n">other</span><span class="o">.</span><span class="vm">__dict__</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">FileBasedSinkWriter</span><span class="p">(</span><span class="n">iobase</span><span class="o">.</span><span class="n">Writer</span><span class="p">):</span> |
| <span class="w"> </span><span class="sd">"""The writer for FileBasedSink.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">sink</span><span class="p">,</span> <span class="n">temp_shard_path</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span> <span class="o">=</span> <span class="n">sink</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">temp_shard_path</span> <span class="o">=</span> <span class="n">temp_shard_path</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">open</span><span class="p">(</span><span class="n">temp_shard_path</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">=</span> <span class="mi">0</span> |
| |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">+=</span> <span class="mi">1</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">write_record</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">at_capacity</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_records_per_shard</span> <span class="ow">and</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">num_records_written</span> <span class="o">>=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_records_per_shard</span> |
| <span class="p">)</span> <span class="ow">or</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_bytes_per_shard</span> <span class="ow">and</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">byte_counter</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">&gt;=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">max_bytes_per_shard</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">sink</span><span class="o">.</span><span class="n">close</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">temp_handle</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">temp_shard_path</span> |
| |
| |
| <span class="k">class</span> <span class="nc">_ByteCountingWriter</span><span class="p">:</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">writer</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">writer</span> <span class="o">=</span> <span class="n">writer</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">=</span> <span class="mi">0</span> |
| |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">bs</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">bytes_written</span> <span class="o">+=</span> <span class="nb">len</span><span class="p">(</span><span class="n">bs</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">bs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">flush</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">flush</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">close</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">writer</span><span class="o">.</span><span class="n">close</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| </div> |
| <footer> |
| |
| |
| <hr/> |
| |
| <div role="contentinfo"> |
| <p> |
| © Copyright |
| |
| </p> |
| </div> |
| Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. |
| |
| </footer> |
| |
| </div> |
| </div> |
| |
| </section> |
| |
| </div> |
| |
| |
| |
| <script type="text/javascript"> |
| jQuery(function () { |
| SphinxRtdTheme.Navigation.enable(true); |
| }); |
| </script> |
| |
| |
| |
| |
| |
| |
| </body> |
| </html> |