blob: 31a44a4df362b6bf0f1bdc74fe262e3638cd9cce [file] [log] [blame]
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>apache_beam.runners.interactive.interactive_beam module &mdash; Apache Beam 2.47.0 documentation</title>
<script type="text/javascript" src="_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script async="async" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="apache_beam.runners.interactive.interactive_environment module" href="apache_beam.runners.interactive.interactive_environment.html" />
<link rel="prev" title="apache_beam.runners.interactive.cache_manager module" href="apache_beam.runners.interactive.cache_manager.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home"> Apache Beam
</a>
<div class="version">
2.47.0
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="apache_beam.coders.html">apache_beam.coders package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.dataframe.html">apache_beam.dataframe package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.io.html">apache_beam.io package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.metrics.html">apache_beam.metrics package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.ml.html">apache_beam.ml package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.options.html">apache_beam.options package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.portability.html">apache_beam.portability package</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="apache_beam.runners.html">apache_beam.runners package</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="apache_beam.runners.html#subpackages">Subpackages</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="apache_beam.runners.dask.html">apache_beam.runners.dask package</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.runners.dataflow.html">apache_beam.runners.dataflow package</a></li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.runners.direct.html">apache_beam.runners.direct package</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="apache_beam.runners.interactive.html">apache_beam.runners.interactive package</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="apache_beam.runners.interactive.html#subpackages">Subpackages</a></li>
<li class="toctree-l4 current"><a class="reference internal" href="apache_beam.runners.interactive.html#submodules">Submodules</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="apache_beam.runners.job.html">apache_beam.runners.job package</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="apache_beam.runners.html#submodules">Submodules</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.testing.html">apache_beam.testing package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.transforms.html">apache_beam.transforms package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.typehints.html">apache_beam.typehints package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.utils.html">apache_beam.utils package</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.yaml.html">apache_beam.yaml package</a></li>
</ul>
<ul>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.error.html">apache_beam.error module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pipeline.html">apache_beam.pipeline module</a></li>
<li class="toctree-l1"><a class="reference internal" href="apache_beam.pvalue.html">apache_beam.pvalue module</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Apache Beam</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> &raquo;</li>
<li><a href="apache_beam.runners.html">apache_beam.runners package</a> &raquo;</li>
<li><a href="apache_beam.runners.interactive.html">apache_beam.runners.interactive package</a> &raquo;</li>
<li>apache_beam.runners.interactive.interactive_beam module</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/apache_beam.runners.interactive.interactive_beam.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="module-apache_beam.runners.interactive.interactive_beam">
<span id="apache-beam-runners-interactive-interactive-beam-module"></span><h1>apache_beam.runners.interactive.interactive_beam module<a class="headerlink" href="#module-apache_beam.runners.interactive.interactive_beam" title="Permalink to this headline"></a></h1>
<p>Module of Interactive Beam features that can be used in notebook.</p>
<p>The purpose of the module is to reduce the learning curve of Interactive Beam
users, provide a single place for importing, and add syntactic sugar for all
Interactive Beam components. It gives users the capability to interact with the
existing environment/session/context for Interactive Beam and visualize
PCollections as bounded datasets. In the meantime, it hides the interactivity
implementation from users so that users can focus on developing Beam pipelines
without worrying about how hidden states in the interactive session are managed.</p>
<dl class="docutils">
<dt>A convention to import this module:</dt>
<dd><code class="docutils literal notranslate"><span class="pre">from apache_beam.runners.interactive import interactive_beam as ib</span></code></dd>
</dl>
<p>Note: If you want backward-compatibility, only invoke interfaces provided by
this module in your notebook or application code.</p>
<dl class="class">
<dt id="apache_beam.runners.interactive.interactive_beam.Options">
<em class="property">class </em><code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">Options</code><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Options"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="apache_beam.runners.interactive.options.interactive_options.html#apache_beam.runners.interactive.options.interactive_options.InteractiveOptions" title="apache_beam.runners.interactive.options.interactive_options.InteractiveOptions"><code class="xref py py-class docutils literal notranslate"><span class="pre">apache_beam.runners.interactive.options.interactive_options.InteractiveOptions</span></code></a></p>
<p>Options that guide how Interactive Beam works.</p>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.enable_recording_replay">
<code class="descname">enable_recording_replay</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.enable_recording_replay" title="Permalink to this definition"></a></dt>
<dd><p>Whether replayable source data recorded should be replayed for multiple
PCollection evaluations and pipeline runs as long as the data recorded is
still valid.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.recordable_sources">
<code class="descname">recordable_sources</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.recordable_sources" title="Permalink to this definition"></a></dt>
<dd><p>Interactive Beam automatically records data from sources in this set.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.recording_duration">
<code class="descname">recording_duration</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.recording_duration" title="Permalink to this definition"></a></dt>
<dd><p>The data recording of sources ends as soon as the background source
recording job has run for this long.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.recording_size_limit">
<code class="descname">recording_size_limit</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.recording_size_limit" title="Permalink to this definition"></a></dt>
<dd><p>The data recording of sources ends as soon as the size (in bytes) of data
recorded from recordable sources reaches the limit.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.display_timestamp_format">
<code class="descname">display_timestamp_format</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.display_timestamp_format" title="Permalink to this definition"></a></dt>
<dd><p>The format in which timestamps are displayed.</p>
<p>Default is ‘%Y-%m-%d %H:%M:%S.%f%z’, e.g. 2020-02-01 15:05:06.000015-08:00.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.display_timezone">
<code class="descname">display_timezone</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.display_timezone" title="Permalink to this definition"></a></dt>
<dd><p>The timezone in which timestamps are displayed.</p>
<p>Defaults to local timezone.</p>
</dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Options.cache_root">
<code class="descname">cache_root</code><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Options.cache_root" title="Permalink to this definition"></a></dt>
<dd><p>The cache directory specified by the user.</p>
<p>Defaults to None.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="apache_beam.runners.interactive.interactive_beam.Recordings">
<em class="property">class </em><code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">Recordings</code><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Recordings"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Recordings" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
<p>An introspection interface for recordings for pipelines.</p>
<p>When a user materializes a PCollection onto disk (e.g. ib.show) for a streaming
pipeline, a background source recording job is started. This job pulls data
from all defined unbounded sources for that PCollection’s pipeline. The
following methods allow for introspection into that background recording job.</p>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Recordings.describe">
<code class="descname">describe</code><span class="sig-paren">(</span><em>pipeline=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Recordings.describe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Recordings.describe" title="Permalink to this definition"></a></dt>
<dd><p>Returns a description of all the recordings for the given pipeline.</p>
<p>If no pipeline is given then this returns a dictionary of descriptions for
all pipelines.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Recordings.clear">
<code class="descname">clear</code><span class="sig-paren">(</span><em>pipeline</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Recordings.clear"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Recordings.clear" title="Permalink to this definition"></a></dt>
<dd><p>Clears all recordings of the given pipeline. Returns True if cleared.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Recordings.stop">
<code class="descname">stop</code><span class="sig-paren">(</span><em>pipeline</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Recordings.stop"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Recordings.stop" title="Permalink to this definition"></a></dt>
<dd><p>Stops the background source recording of the given pipeline.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Recordings.record">
<code class="descname">record</code><span class="sig-paren">(</span><em>pipeline</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Recordings.record"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Recordings.record" title="Permalink to this definition"></a></dt>
<dd><p>Starts a background source recording job for the given pipeline. Returns
True if the recording job was started.</p>
</dd></dl>
</dd></dl>
<dl class="class">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters">
<em class="property">class </em><code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">Clusters</code><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
<p>An interface to control clusters implicitly created and managed by
the current interactive environment. This class is not needed and
should not be used otherwise.</p>
<p>Do not use it for clusters a user explicitly manages: e.g., if you have
a Flink cluster running somewhere and provide the flink master when
running a pipeline with the FlinkRunner, the cluster will not be tracked
or managed by Beam.
To reuse the same cluster for your pipelines, use the same pipeline
options: e.g., a pipeline option with the same flink master if you are
using FlinkRunner.</p>
<p>This module is experimental. No backwards-compatibility guarantees.</p>
<p>Interactive Beam automatically creates/reuses existing worker clusters to
execute pipelines when it detects the need from configurations.
Currently, the only supported cluster implementation is Flink running on
Cloud Dataproc.</p>
<p>To configure a pipeline to run on Cloud Dataproc with Flink, set the
underlying runner of the InteractiveRunner to FlinkRunner and the pipeline
options to indicate where on Cloud the FlinkRunner should be deployed to.</p>
<blockquote>
<div><p>An example to enable automatic Dataproc cluster creation/reuse:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">options</span> <span class="o">=</span> <span class="n">PipelineOptions</span><span class="p">([</span>
<span class="s1">&#39;--project=my-project&#39;</span><span class="p">,</span>
<span class="s1">&#39;--region=my-region&#39;</span><span class="p">,</span>
<span class="s1">&#39;--environment_type=DOCKER&#39;</span><span class="p">])</span>
<span class="n">pipeline</span> <span class="o">=</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">(</span><span class="n">InteractiveRunner</span><span class="p">(</span>
<span class="n">underlying_runner</span><span class="o">=</span><span class="n">FlinkRunner</span><span class="p">()),</span> <span class="n">options</span><span class="o">=</span><span class="n">options</span><span class="p">)</span>
</pre></div>
</div>
</div></blockquote>
<p>Reusing pipeline options in another pipeline configures Interactive Beam
to reuse the same Dataproc cluster implicitly managed by the current
interactive environment.
If a flink_master is identified as a known cluster, the corresponding cluster
is also reused.
Furthermore, if a cluster is explicitly created by using a pipeline as an
identifier to a known cluster, the cluster is reused.</p>
<blockquote>
<div><p>An example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># If pipeline runs on a known cluster, below code reuses the cluster</span>
<span class="c1"># manager without creating a new one.</span>
<span class="n">dcm</span> <span class="o">=</span> <span class="n">ib</span><span class="o">.</span><span class="n">clusters</span><span class="o">.</span><span class="n">create</span><span class="p">(</span><span class="n">pipeline</span><span class="p">)</span>
</pre></div>
</div>
</div></blockquote>
<p>To provision the cluster, use WorkerOptions. Supported configurations are:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="mf">1.</span> <span class="n">subnetwork</span>
<span class="mf">2.</span> <span class="n">num_workers</span>
<span class="mf">3.</span> <span class="n">machine_type</span>
</pre></div>
</div>
<p>To configure a pipeline to run on an existing FlinkRunner deployed elsewhere,
set the flink_master explicitly so no cluster will be created/reused.</p>
<blockquote>
<div><p>Example pipeline options to skip automatic Dataproc cluster usage:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">options</span> <span class="o">=</span> <span class="n">PipelineOptions</span><span class="p">([</span>
<span class="s1">&#39;--flink_master=some.self.hosted.flink:port&#39;</span><span class="p">,</span>
<span class="s1">&#39;--environment_type=DOCKER&#39;</span><span class="p">])</span>
</pre></div>
</div>
</div></blockquote>
<p>To configure a pipeline to run on a local FlinkRunner, explicitly set the
default cluster metadata to None: ib.clusters.set_default_cluster(None).</p>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.DATAPROC_FLINK_VERSION">
<code class="descname">DATAPROC_FLINK_VERSION</code><em class="property"> = '1.12'</em><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.DATAPROC_FLINK_VERSION" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="attribute">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.DATAPROC_MINIMUM_WORKER_NUM">
<code class="descname">DATAPROC_MINIMUM_WORKER_NUM</code><em class="property"> = 2</em><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.DATAPROC_MINIMUM_WORKER_NUM" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.create">
<code class="descname">create</code><span class="sig-paren">(</span><em>cluster_identifier: Union[str, apache_beam.pipeline.Pipeline, apache_beam.runners.interactive.dataproc.types.ClusterMetadata]</em><span class="sig-paren">)</span> &#x2192; apache_beam.runners.interactive.dataproc.dataproc_cluster_manager.DataprocClusterManager<a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters.create"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.create" title="Permalink to this definition"></a></dt>
<dd><p>Creates a Dataproc cluster manager provisioned for the cluster
identified. If the cluster is known, returns an existing cluster manager.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.cleanup">
<code class="descname">cleanup</code><span class="sig-paren">(</span><em>cluster_identifier: Union[str</em>, <em>apache_beam.pipeline.Pipeline</em>, <em>apache_beam.runners.interactive.dataproc.types.ClusterMetadata</em>, <em>None] = None</em>, <em>force: bool = False</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters.cleanup"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.cleanup" title="Permalink to this definition"></a></dt>
<dd><p>Cleans up the cluster associated with the given cluster_identifier.</p>
<p>When cluster_identifier is None: if force is True, cleans up all
clusters; otherwise, performs a dry run and is a NOOP.
If a beam.Pipeline is given as the ClusterIdentifier while multiple
pipelines share the same cluster, it only cleans up the association between
the pipeline and the cluster identified.
If the cluster_identifier is unknown, NOOP.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.describe">
<code class="descname">describe</code><span class="sig-paren">(</span><em>cluster_identifier: Union[str</em>, <em>apache_beam.pipeline.Pipeline</em>, <em>apache_beam.runners.interactive.dataproc.types.ClusterMetadata</em>, <em>None] = None</em><span class="sig-paren">)</span> &#x2192; Union[apache_beam.runners.interactive.dataproc.types.ClusterMetadata, List[apache_beam.runners.interactive.dataproc.types.ClusterMetadata]]<a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters.describe"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.describe" title="Permalink to this definition"></a></dt>
<dd><p>Describes the ClusterMetadata by a ClusterIdentifier.</p>
<p>If no cluster_identifier is given or if the cluster_identifier is unknown,
it returns descriptions for all known clusters.</p>
<p>Example usage:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span># Describe the cluster executing work for a pipeline.
ib.clusters.describe(pipeline)
# Describe the cluster with the flink master url.
ib.clusters.describe(master_url)
# Describe all existing clusters.
ib.clusters.describe()
</pre></div>
</div>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.set_default_cluster">
<code class="descname">set_default_cluster</code><span class="sig-paren">(</span><em>cluster_identifier: Union[str</em>, <em>apache_beam.pipeline.Pipeline</em>, <em>apache_beam.runners.interactive.dataproc.types.ClusterMetadata</em>, <em>None] = None</em><span class="sig-paren">)</span> &#x2192; None<a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters.set_default_cluster"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.set_default_cluster" title="Permalink to this definition"></a></dt>
<dd><p>Temporarily sets the default metadata for creating or reusing a
DataprocClusterManager. It is always updated to the most recently created
cluster.</p>
<p>If no known ClusterMetadata can be identified by the ClusterIdentifier, NOOP.
If None is set, next time when Flink is in use, if no cluster is explicitly
configured by a pipeline, the job runs locally.</p>
</dd></dl>
<dl class="method">
<dt id="apache_beam.runners.interactive.interactive_beam.Clusters.cluster_metadata">
<code class="descname">cluster_metadata</code><span class="sig-paren">(</span><em>cluster_identifier: Union[str</em>, <em>apache_beam.pipeline.Pipeline</em>, <em>apache_beam.runners.interactive.dataproc.types.ClusterMetadata</em>, <em>None] = None</em><span class="sig-paren">)</span> &#x2192; Optional[apache_beam.runners.interactive.dataproc.types.ClusterMetadata]<a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#Clusters.cluster_metadata"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.Clusters.cluster_metadata" title="Permalink to this definition"></a></dt>
<dd><p>Fetches the ClusterMetadata by a ClusterIdentifier that could be a
URL in string form, a Beam pipeline, or an equivalent to a known ClusterMetadata.</p>
<p>If the given cluster_identifier is a URL or a pipeline that is unknown to
the current environment, the default cluster metadata (could be None) is
returned.
If the given cluster_identifier is a ClusterMetadata but unknown to the
current environment, passes it through (NOOP).</p>
</dd></dl>
</dd></dl>
<dl class="function">
<dt id="apache_beam.runners.interactive.interactive_beam.watch">
<code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">watch</code><span class="sig-paren">(</span><em>watchable</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#watch"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.watch" title="Permalink to this definition"></a></dt>
<dd><p>Monitors a watchable.</p>
<p>This allows Interactive Beam to implicitly pass on the information about the
location of your pipeline definition.</p>
<p>Current implementation mainly watches for PCollection variables defined in
user code. A watchable can be a dictionary of variable metadata such as
locals(), a str name of a module, a module object or an instance of a class.
The variable can come from any scope even local variables in a method of a
class defined in a module.</p>
<blockquote>
<div><p>Below are all valid:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">watch</span><span class="p">(</span><span class="n">__main__</span><span class="p">)</span> <span class="c1"># if import __main__ is already invoked</span>
<span class="n">watch</span><span class="p">(</span><span class="s1">&#39;__main__&#39;</span><span class="p">)</span> <span class="c1"># does not require invoking import __main__ beforehand</span>
<span class="n">watch</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="c1"># inside a class</span>
<span class="n">watch</span><span class="p">(</span><span class="n">SomeInstance</span><span class="p">())</span> <span class="c1"># an instance of a class</span>
<span class="n">watch</span><span class="p">(</span><span class="nb">locals</span><span class="p">())</span> <span class="c1"># inside a function, watching local variables within</span>
</pre></div>
</div>
</div></blockquote>
<p>If you write a Beam pipeline in the __main__ module directly, since the
__main__ module is always watched, you don’t have to instruct Interactive
Beam. If your Beam pipeline is defined in some module other than __main__,
such as inside a class function or a unit test, you can watch() the scope.</p>
<blockquote>
<div><p>For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">Foo</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">run_pipeline</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">with</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">()</span> <span class="k">as</span> <span class="n">p</span><span class="p">:</span>
<span class="n">init_pcoll</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="s1">&#39;Init Create&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Create</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
<span class="n">watch</span><span class="p">(</span><span class="nb">locals</span><span class="p">())</span>
<span class="k">return</span> <span class="n">init_pcoll</span>
<span class="n">init_pcoll</span> <span class="o">=</span> <span class="n">Foo</span><span class="p">()</span><span class="o">.</span><span class="n">run_pipeline</span><span class="p">()</span>
</pre></div>
</div>
<p>Interactive Beam caches init_pcoll for the first run.</p>
<p>Then you can use:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">show</span><span class="p">(</span><span class="n">init_pcoll</span><span class="p">)</span>
</pre></div>
</div>
<p>To visualize data from init_pcoll once the pipeline is executed.</p>
</div></blockquote>
</dd></dl>
<dl class="function">
<dt id="apache_beam.runners.interactive.interactive_beam.show">
<code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">show</code><span class="sig-paren">(</span><em>*pcolls</em>, <em>include_window_info=False</em>, <em>visualize_data=False</em>, <em>n='inf'</em>, <em>duration='inf'</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#show"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.show" title="Permalink to this definition"></a></dt>
<dd><p>Shows given PCollections in an interactive exploratory way if used within
a notebook, or prints the head of the sampled data if used within an ipython shell.
Noop if used in a non-interactive environment.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>include_window_info</strong> – (optional) if True, windowing information of the
data will be visualized too. Default is false.</li>
<li><strong>visualize_data</strong> – (optional) by default, the visualization contains data
tables rendering data from given pcolls separately as if they are
converted into dataframes. If visualize_data is True, there will also be a
dive-in widget and a statistical overview widget of the data.
Otherwise, those 2 data visualization widgets will not be displayed.</li>
<li><strong>n</strong> – (optional) max number of elements to visualize. Default ‘inf’.</li>
<li><strong>duration</strong> – (optional) max duration of elements to read in integer seconds or
a string duration. Default ‘inf’.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p>The given pcolls can be a dictionary of PCollections (as values), an iterable
of PCollections, or plain PCollection values.</p>
<p>The user can specify either the max number of elements with <cite>n</cite> to read
or the maximum duration of elements to read with <cite>duration</cite>. When a limiter is
not supplied, it is assumed to be infinite.</p>
<p>By default, the visualization contains data tables rendering data from given
pcolls separately as if they are converted into dataframes. If visualize_data
is True, there will also be a dive-in widget and a statistical overview widget
of the data. Otherwise, those 2 data visualization widgets will not be
displayed.</p>
<p>Ad hoc builds a pipeline fragment including only transforms that are
necessary to produce data for given PCollections pcolls, runs the pipeline
fragment to compute data for those pcolls and then visualizes the data.</p>
<p>The function is always blocking. If used within a notebook, the data
visualized might be dynamically updated before the function returns as more
and more data could be getting processed and emitted when the pipeline fragment
is being executed. If used within an ipython shell, there will be no dynamic
plotting but a static plot at the end of pipeline fragment execution.</p>
<p>The PCollections given must belong to the same pipeline.</p>
<blockquote>
<div><p>For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">p</span> <span class="o">=</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">(</span><span class="n">InteractiveRunner</span><span class="p">())</span>
<span class="n">init</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="s1">&#39;Init&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Create</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1000</span><span class="p">))</span>
<span class="n">square</span> <span class="o">=</span> <span class="n">init</span> <span class="o">|</span> <span class="s1">&#39;Square&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="o">*</span> <span class="n">x</span><span class="p">)</span>
<span class="n">cube</span> <span class="o">=</span> <span class="n">init</span> <span class="o">|</span> <span class="s1">&#39;Cube&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="o">**</span> <span class="mi">3</span><span class="p">)</span>
<span class="c1"># Below builds a pipeline fragment from the defined pipeline `p` that</span>
<span class="c1"># contains only applied transforms of `Init` and `Square`. Then the</span>
<span class="c1"># interactive runner runs the pipeline fragment implicitly to compute data</span>
<span class="c1"># represented by PCollection `square` and visualizes it.</span>
<span class="n">show</span><span class="p">(</span><span class="n">square</span><span class="p">)</span>
<span class="c1"># This is equivalent to `show(square)` because `square` depends on `init`</span>
<span class="c1"># and `init` is included in the pipeline fragment and computed anyway.</span>
<span class="n">show</span><span class="p">(</span><span class="n">init</span><span class="p">,</span> <span class="n">square</span><span class="p">)</span>
<span class="c1"># Below is similar to running `p.run()`. It computes data for both</span>
<span class="c1"># PCollection `square` and PCollection `cube`, then visualizes them.</span>
<span class="n">show</span><span class="p">(</span><span class="n">square</span><span class="p">,</span> <span class="n">cube</span><span class="p">)</span>
</pre></div>
</div>
</div></blockquote>
</dd></dl>
<dl class="function">
<dt id="apache_beam.runners.interactive.interactive_beam.collect">
<code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">collect</code><span class="sig-paren">(</span><em>pcoll</em>, <em>n='inf'</em>, <em>duration='inf'</em>, <em>include_window_info=False</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#collect"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.collect" title="Permalink to this definition"></a></dt>
<dd><p>Materializes the elements from a PCollection into a Dataframe.</p>
<p>This reads each element from file and reads only the amount that it needs
into memory. The user can specify either the max number of elements to read
or the maximum duration of elements to read. When a limiter is not supplied,
it is assumed to be infinite.</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>n</strong> – (optional) max number of elements to read. Default ‘inf’.</li>
<li><strong>duration</strong> – (optional) max duration of elements to read in integer seconds or
a string duration. Default ‘inf’.</li>
<li><strong>include_window_info</strong> – (optional) if True, appends the windowing information
to each row. Default False.</li>
</ul>
</td>
</tr>
</tbody>
</table>
<p>For example:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">p</span> <span class="o">=</span> <span class="n">beam</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">(</span><span class="n">InteractiveRunner</span><span class="p">())</span>
<span class="n">init</span> <span class="o">=</span> <span class="n">p</span> <span class="o">|</span> <span class="s1">&#39;Init&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Create</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">))</span>
<span class="n">square</span> <span class="o">=</span> <span class="n">init</span> <span class="o">|</span> <span class="s1">&#39;Square&#39;</span> <span class="o">&gt;&gt;</span> <span class="n">beam</span><span class="o">.</span><span class="n">Map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span> <span class="o">*</span> <span class="n">x</span><span class="p">)</span>
<span class="c1"># Run the pipeline and bring the PCollection into memory as a Dataframe.</span>
<span class="n">in_memory_square</span> <span class="o">=</span> <span class="n">collect</span><span class="p">(</span><span class="n">square</span><span class="p">,</span> <span class="n">n</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
</pre></div>
</div>
</dd></dl>
<dl class="function">
<dt id="apache_beam.runners.interactive.interactive_beam.show_graph">
<code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">show_graph</code><span class="sig-paren">(</span><em>pipeline</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#show_graph"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.show_graph" title="Permalink to this definition"></a></dt>
<dd><p>Shows the current pipeline shape of a given Beam pipeline as a DAG.</p>
</dd></dl>
<dl class="function">
<dt id="apache_beam.runners.interactive.interactive_beam.evict_recorded_data">
<code class="descclassname">apache_beam.runners.interactive.interactive_beam.</code><code class="descname">evict_recorded_data</code><span class="sig-paren">(</span><em>pipeline=None</em><span class="sig-paren">)</span><a class="reference internal" href="_modules/apache_beam/runners/interactive/interactive_beam.html#evict_recorded_data"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#apache_beam.runners.interactive.interactive_beam.evict_recorded_data" title="Permalink to this definition"></a></dt>
<dd><p>Forcefully evicts all recorded replayable data for the given pipeline. If
no pipeline is specified, evicts for all user defined pipelines.</p>
<p>Once invoked, Interactive Beam will record new data based on the guidance of
options the next time it evaluates/visualizes PCollections or runs pipelines.</p>
</dd></dl>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="apache_beam.runners.interactive.interactive_environment.html" class="btn btn-neutral float-right" title="apache_beam.runners.interactive.interactive_environment module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="apache_beam.runners.interactive.cache_manager.html" class="btn btn-neutral float-left" title="apache_beam.runners.interactive.cache_manager module" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
&copy; Copyright
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
// Registered with jQuery so it runs once the document is ready; it then
// switches on the theme's navigation handling.
// NOTE(review): the meaning of the `true` argument is defined by the theme's
// own script (not visible here) - confirm against theme.js before changing it.
jQuery(function enableThemeNavigation() {
var themeNavigation = SphinxRtdTheme.Navigation;
themeNavigation.enable(true);
});
</script>
</body>
</html>