blob: 6852e135c3980d4252fa08d8a72cc98c6f24b544 [file] [log] [blame]
<!doctype html><html lang=en class=no-js><head><meta charset=utf-8><meta http-equiv=x-ua-compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1"><title>Managing Python Pipeline Dependencies</title><meta name=description content="Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes."><link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700" rel=stylesheet><link rel=preload href=/scss/main.min.d653ded46cd5f19a535cb20567fce9699849fe46f950d91ac6bf336db8ff8724.css as=style><link href=/scss/main.min.d653ded46cd5f19a535cb20567fce9699849fe46f950d91ac6bf336db8ff8724.css rel=stylesheet integrity><script src=https://code.jquery.com/jquery-2.2.4.min.js></script><style>.body__contained img{max-width:100%}</style><script type=text/javascript src=/js/bootstrap.min.2979f9a6e32fc42c3e7406339ee9fe76b31d1b52059776a02b4a7fa6a4fd280a.js defer></script>
<script type=text/javascript src=/js/language-switch-v2.min.121952b7980b920320ab229551857669209945e39b05ba2b433a565385ca44c6.js defer></script>
<script type=text/javascript src=/js/fix-menu.min.039174b67107465f2090a493f91e126f7aa797f29420f9edab8a54d9dd4b3d2d.js defer></script>
<script type=text/javascript src=/js/section-nav.min.1405fd5e70fab5f6c54037c269b1d137487d8f3d1b3009032525f6db3fbce991.js defer></script>
<script type=text/javascript src=/js/page-nav.min.af231204c9c52c5089d53a4c02739eacbb7f939e3be1c6ffcc212e0ac4dbf879.js defer></script>
<script type=text/javascript src=/js/expandable-list.min.75a4526624a3b8898fe7fb9e3428c205b581f8b38c7926922467aef17eac69f2.js defer></script>
<script type=text/javascript src=/js/copy-to-clipboard.min.364c06423d7e8993fc42bb4abc38c03195bc8386db26d18774ce775d08d5b18d.js defer></script>
<script type=text/javascript src=/js/calendar.min.336664054fa0f52b08bbd4e3c59b5cb6d63dcfb2b4d602839746516b0817446b.js defer></script>
<script type=text/javascript src=/js/fix-playground-nested-scroll.min.0283f1037cb1b9d5074c6eaf041292b524a8148a7cdb803d5ccd6d1fc4eb3253.js defer></script>
<script type=text/javascript src=/js/anchor-content-jump-fix.min.22d3240f81632e4c11179b9d2aaf37a40da9414333c43aa97344e8b21a7df0e4.js defer></script>
<link rel=alternate type=application/rss+xml title="Apache Beam" href=/feed.xml><link rel=canonical href=/documentation/sdks/python-pipeline-dependencies/ data-proofer-ignore><link rel="shortcut icon" type=image/x-icon href=/images/favicon.ico><link rel=stylesheet href=https://use.fontawesome.com/releases/v5.4.1/css/all.css integrity=sha384-5sAR7xN1Nv6T6+dT2mhtzEpVJvfS3NScPQTrOxhwjIuvcA67KV2R5Jz6kr4abQsz crossorigin=anonymous><link rel=stylesheet href=https://unpkg.com/swiper@8/swiper-bundle.min.css><script async src=https://platform.twitter.com/widgets.js></script>
<script>(function(e,t,n,s,o,i,a){e.GoogleAnalyticsObject=o,e[o]=e[o]||function(){(e[o].q=e[o].q||[]).push(arguments)},e[o].l=1*new Date,i=t.createElement(n),a=t.getElementsByTagName(n)[0],i.async=1,i.src=s,a.parentNode.insertBefore(i,a)})(window,document,"script","//www.google-analytics.com/analytics.js","ga"),ga("create","UA-73650088-1","auto"),ga("send","pageview")</script><script>(function(e,t,n,s,o,i){e.hj=e.hj||function(){(e.hj.q=e.hj.q||[]).push(arguments)},e._hjSettings={hjid:2182187,hjsv:6},o=t.getElementsByTagName("head")[0],i=t.createElement("script"),i.async=1,i.src=n+e._hjSettings.hjid+s+e._hjSettings.hjsv,o.appendChild(i)})(window,document,"https://static.hotjar.com/c/hotjar-",".js?sv=")</script></head><body class=body data-spy=scroll data-target=.page-nav data-offset=0><nav class="navigation-bar-mobile header navbar navbar-fixed-top"><div class=navbar-header><a href=/ class=navbar-brand><img alt=Brand style=height:46px;width:43px src=/images/beam_logo_navbar_mobile.png></a>
<a class=navbar-link href=/get-started/>Get Started</a>
<a class=navbar-link href=/documentation/>Documentation</a>
<button type=button class="navbar-toggle menu-open" aria-expanded=false aria-controls=navbar onclick=openMenu()>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button></div><div class="navbar-mask closed"></div><div id=navbar class="navbar-container closed"><button type=button class=navbar-toggle aria-expanded=false aria-controls=navbar id=closeMenu>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button><ul class="nav navbar-nav"><li><div class=searchBar-mobile><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search></div></li><li><a class=navbar-link href=/about>About</a></li><li><a class=navbar-link href=/get-started/>Get Started</a></li><li><span class=navbar-link>Documentation</span><ul><li><a href=/documentation/>General</a></li><li><a href=/documentation/sdks/java/>Languages</a></li><li><a href=/documentation/runners/capability-matrix/>Runners</a></li><li><a href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><li><a class=navbar-link href=/roadmap/>Roadmap</a></li><li><a class=navbar-link href=/community/>Community</a></li><li><a class=navbar-link href=/contribute/>Contribute</a></li><li><a class=navbar-link href=/blog/>Blog</a></li><li><a class=navbar-link href=/case-studies/>Case Studies</a></li></ul><ul class="nav navbar-nav navbar-right"><li><a href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a></li><li class=dropdown><a href=# class=dropdown-toggle id=apache-dropdown data-toggle=dropdown role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class="dropdown-menu dropdown-menu-right"><li><a target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a target=_blank href=https://www.apache.org/security/>Security</a></li><li><a target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></ul></div></nav><nav class=navigation-bar-desktop><a href=/ class=navbar-logo><img src=/images/beam_logo_navbar.png alt="Beam Logo"></a><div class=navbar-bar-left><div class=navbar-links><a class=navbar-link href=/about>About</a>
<a class=navbar-link href=/get-started/>Get Started</a><li class="dropdown navbar-dropdown navbar-dropdown-documentation"><a href=# class="dropdown-toggle navbar-link" role=button aria-haspopup=true aria-expanded=false>Documentation
<span><svg xmlns="http://www.w3.org/2000/svg" width="12" height="11" fill="none" viewBox="0 0 12 11"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10.666 4.535 5.847 9.108 1.444 4.535"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link href=/documentation/>General</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/sdks/java/>Languages</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/runners/capability-matrix/>Runners</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><a class=navbar-link href=/roadmap/>Roadmap</a>
<a class=navbar-link href=/community/>Community</a>
<a class=navbar-link href=/contribute/>Contribute</a>
<a class=navbar-link href=/blog/>Blog</a>
<a class=navbar-link href=/case-studies/>Case Studies</a></div><div id=iconsBar><a type=button onclick=showSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M10.191 17c3.866.0 7-3.134 7-7s-3.134-7-7-7-7 3.134-7 7 3.134 7 7 7zm11 4-6-6"/></svg></a><a target=_blank href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a><li class="dropdown navbar-dropdown navbar-dropdown-apache"><a href=# class=dropdown-toggle role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/security/>Security</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></div><div class="searchBar disappear"><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search>
<a type=button onclick=endSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M21.122 20.827 4.727 4.432M21.122 4.43 4.727 20.827"/></svg></a></div></div></nav><div class=header-push></div><div class="top-banners swiper"><div class=swiper-wrapper><div class=swiper-slide><a href=https://tour.beam.apache.org><img class=banner-img-desktop src=/images/banners/tour-of-beam/tour-of-beam-desktop.png alt="Start Tour of Beam">
<img class=banner-img-mobile src=/images/banners/tour-of-beam/tour-of-beam-mobile.png alt="Start Tour of Beam"></a></div><div class=swiper-slide><a href=https://beam.apache.org/documentation/ml/overview/><img class=banner-img-desktop src=/images/banners/machine-learning/machine-learning-desktop.jpg alt="Machine Learning">
<img class=banner-img-mobile src=/images/banners/machine-learning/machine-learning-mobile.jpg alt="Machine Learning"></a></div></div><div class=swiper-pagination></div><div class=swiper-button-prev></div><div class=swiper-button-next></div></div><script src=/js/swiper-bundle.min.min.e0e8f81b0b15728d35ff73c07f42ddbb17a108d6f23df4953cb3e60df7ade675.js></script>
<script src=/js/sliders/top-banners.min.afa7d0a19acf7a3b28ca369490b3d401a619562a2a4c9612577be2f66a4b9855.js></script>
<script>
// Search-bar and mobile-menu helpers. showSearch, endSearch and openMenu are
// referenced by inline onclick attributes in the navigation markup, so all of
// these stay as global function declarations.

// Sets the search prompt as the placeholder of every element matched by the
// jQuery selector "input:text".
function addPlaceholder() {
  $("input:text").attr("placeholder", "What are you looking for?");
}

// Toggles the "fixedPosition" class on <body>.
function blockScroll() {
  $("body").toggleClass("fixedPosition");
}

// Shows the .searchBar element and hides the #iconsBar element.
function showSearch() {
  addPlaceholder();
  var searchBar = document.querySelector(".searchBar");
  searchBar.classList.remove("disappear");
  var iconsBar = document.querySelector("#iconsBar");
  iconsBar.classList.add("disappear");
}

// Hides the .searchBar element and shows the #iconsBar element again.
function endSearch() {
  var searchBar = document.querySelector(".searchBar");
  searchBar.classList.add("disappear");
  var iconsBar = document.querySelector("#iconsBar");
  iconsBar.classList.remove("disappear");
}

// Handler for the mobile menu toggle: sets the placeholder, then toggles the
// body's "fixedPosition" class.
function openMenu() {
  addPlaceholder();
  blockScroll();
}
</script><div class="clearfix container-main-content"><div class="section-nav closed" data-offset-top=90 data-offset-bottom=500><span class="section-nav-back glyphicon glyphicon-menu-left"></span><nav><ul class=section-nav-list data-section-nav><li><span class=section-nav-list-main-title>Languages</span></li><li><span class=section-nav-list-title>Java</span><ul class=section-nav-list><li><a href=/documentation/sdks/java/>Java SDK overview</a></li><li><a href=https://beam.apache.org/releases/javadoc/2.55.1/ target=_blank>Java SDK API reference <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li><li><a href=/documentation/sdks/java-dependencies/>Java SDK dependencies</a></li><li><a href=/documentation/sdks/java-extensions/>Java SDK extensions</a></li><li><a href=/documentation/sdks/java-thirdparty/>Java 3rd party extensions</a></li><li><a href=/documentation/sdks/java/testing/nexmark/>Nexmark benchmark suite</a></li><li><a href=/documentation/sdks/java/testing/tpcds/>TPC-DS benchmark suite</a></li><li><a href=/documentation/sdks/java-multi-language-pipelines/>Java multi-language pipelines quickstart</a></li></ul></li><li><span class=section-nav-list-title>Python</span><ul class=section-nav-list><li><a href=/documentation/sdks/python/>Python SDK overview</a></li><li><a href=https://beam.apache.org/releases/pydoc/2.55.1/ target=_blank>Python SDK API reference <img 
src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li><li><a href=/documentation/sdks/python-dependencies/>Python SDK dependencies</a></li><li><a href=/documentation/sdks/python-streaming/>Python streaming pipelines</a></li><li><a href=/documentation/sdks/python-type-safety/>Ensuring Python type safety</a></li><li><a href=/documentation/sdks/python-machine-learning/>Machine Learning</a></li><li><a href=/documentation/sdks/python-pipeline-dependencies/>Managing pipeline dependencies</a></li><li><a href=/documentation/sdks/python-multi-language-pipelines/>Python multi-language pipelines quickstart</a></li><li><a href=/documentation/sdks/python-unrecoverable-errors/>Python Unrecoverable Errors</a></li></ul></li><li><span class=section-nav-list-title>Go</span><ul class=section-nav-list><li><a href=/documentation/sdks/go/>Go SDK overview</a></li><li><a href=https://pkg.go.dev/github.com/apache/beam/sdks/v2/go/pkg/beam target=_blank>Go SDK API reference <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li><li><a href=/documentation/sdks/go-dependencies/>Go SDK dependencies</a></li><li><a href=/documentation/sdks/go-cross-compilation/>Cross compilation</a></li></ul></li><li><span class=section-nav-list-title>Typescript</span><ul class=section-nav-list><li><a href=/documentation/sdks/typescript/>Typescript SDK overview</a></li><li><a href=https://beam.apache.org/releases/typedoc/current/ target=_blank>Typescript SDK API reference <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li></ul></li><li><span class=section-nav-list-title>Scala</span><ul class=section-nav-list><li><a href=/documentation/sdks/scala/>Scio</a></li><li><a href=https://spotify.github.io/scio/api/com/spotify/scio/index.html target=_blank>Scio SDK API reference <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li></ul></li><li><span 
class=section-nav-list-title>Yaml</span><ul class=section-nav-list><li><a href=/documentation/sdks/yaml/>Yaml overview</a></li><li><a href=/documentation/sdks/yaml-udf/>Yaml User Defined Functions</a></li><li><a href=/documentation/sdks/yaml-combine/>Yaml Aggregation</a></li><li><a href=/documentation/sdks/yaml-errors/>Error handling</a></li><li><a href=/documentation/sdks/yaml-inline-python/>Inlining Python</a></li><li><a href=https://beam.apache.org/releases/yamldoc/current/ target=_blank>YAML API reference <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></ul></li><li><span class=section-nav-list-title>SQL</span><ul class=section-nav-list><li><a href=/documentation/dsls/sql/overview/>Overview</a></li><li><a href=/documentation/dsls/sql/walkthrough/>Walkthrough</a></li><li><a href=/documentation/dsls/sql/shell/>Shell</a></li><li class=section-nav-item--collapsible><span class=section-nav-list-title>Apache Calcite dialect</span><ul class=section-nav-list><li><a href=/documentation/dsls/sql/calcite/overview/>Calcite support overview</a></li><li><a href=/documentation/dsls/sql/calcite/query-syntax/>Query syntax</a></li><li><a href=/documentation/dsls/sql/calcite/lexical/>Lexical structure</a></li><li><a href=/documentation/dsls/sql/calcite/data-types/>Data types</a></li><li><a href=/documentation/dsls/sql/calcite/scalar-functions/>Scalar functions</a></li><li><a href=/documentation/dsls/sql/calcite/aggregate-functions/>Aggregate functions</a></li></ul></li><li class=section-nav-item--collapsible><span class=section-nav-list-title>ZetaSQL dialect</span><ul class=section-nav-list><li><a href=/documentation/dsls/sql/zetasql/overview/>ZetaSQL support overview</a></li><li><a href=/documentation/dsls/sql/zetasql/syntax/>Function call rules</a></li><li><a href=/documentation/dsls/sql/zetasql/conversion-rules/>Conversion rules</a></li><li><a href=/documentation/dsls/sql/zetasql/query-syntax/>Query syntax</a></li><li><a 
href=/documentation/dsls/sql/zetasql/lexical/>Lexical structure</a></li><li><a href=/documentation/dsls/sql/zetasql/data-types/>Data types</a></li><li><a href=/documentation/dsls/sql/zetasql/operators/>Operators</a></li><li class=section-nav-item--collapsible><span class=section-nav-list-title>Scalar functions</span><ul class=section-nav-list><li><a href=/documentation/dsls/sql/zetasql/string-functions/>String functions</a></li><li><a href=/documentation/dsls/sql/zetasql/math-functions/>Mathematical functions</a></li><li><a href=/documentation/dsls/sql/zetasql/conditional-expressions/>Conditional expressions</a></li></ul></li><li><a href=/documentation/dsls/sql/zetasql/aggregate-functions/>Aggregate functions</a></li></ul></li><li class=section-nav-item--collapsible><span class=section-nav-list-title>Beam SQL extensions</span><ul class=section-nav-list><li><a href=/documentation/dsls/sql/extensions/create-external-table/>CREATE EXTERNAL TABLE</a></li><li><a href=/documentation/dsls/sql/extensions/windowing-and-triggering/>Windowing & triggering</a></li><li><a href=/documentation/dsls/sql/extensions/joins/>Joins</a></li><li><a href=/documentation/dsls/sql/extensions/user-defined-functions/>User-defined functions</a></li><li><a href=/documentation/dsls/sql/extensions/set/>SET pipeline options</a></li></ul></li></ul></li><li><span class=section-nav-list-title>DataFrames</span><ul class=section-nav-list><li><a href=/documentation/dsls/dataframes/overview/>Overview</a></li><li><a href=/documentation/dsls/dataframes/differences-from-pandas/>Differences from pandas</a></li><li><a href=https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/dataframe target=_blank>Example pipelines <img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></li><li><a href=https://beam.apache.org/releases/pydoc/2.55.1/apache_beam.dataframe.html target=_blank>DataFrame API reference <img src=/images/external-link-icon.png width=14 height=14 
alt="External link."></a></li></ul></li></ul></nav></div><nav class="page-nav clearfix" data-offset-top=90 data-offset-bottom=500><nav id=TableOfContents><ul><li><a href=#pypi-dependencies>PyPI Dependencies</a></li><li><a href=#custom-containers>Custom Containers</a></li><li><a href=#local-or-nonpypi>Local Python packages or non-public Python Dependencies</a></li><li><a href=#multiple-file-dependencies>Multiple File Dependencies</a></li><li><a href=#nonpython>Non-Python Dependencies or PyPI Dependencies with Non-Python Dependencies</a></li><li><a href=#pre-building-sdk-container-image>Pre-building SDK Container Image</a></li><li><a href=#pickling-and-managing-the-main-session>Pickling and Managing the Main Session</a></li><li><a href=#control-dependencies>Control the dependencies the pipeline uses</a><ul><li><a href=#pipeline-environments>Pipeline environments</a></li><li><a href=#create-reproducible-environments>Create reproducible environments</a></li><li><a href=#make-the-pipeline-runtime-environment-reproducible>Make the pipeline runtime environment reproducible</a></li><li><a href=#make-the-pipeline-launch-environment-reproducible>Make the pipeline launch environment reproducible</a></li><li><a href=#make-the-launch-environment-compatible-with-the-runtime-environment>Make the launch environment compatible with the runtime environment</a></li></ul></li></ul></nav></nav><div class="body__contained body__section-nav"><h1 id=managing-python-pipeline-dependencies>Managing Python Pipeline Dependencies</h1><p>Dependency management is about specifying dependencies that your pipeline requires, and controlling which dependencies are used in production.</p><p><strong>Note:</strong> Remote workers used for pipeline execution typically have a standard Python distribution installation in a Debian-based container image. 
If your code relies only on standard Python packages, then you probably don&rsquo;t need to do anything on this page.</p><h2 id=pypi-dependencies>PyPI Dependencies</h2><p>If your pipeline uses public packages from the <a href=https://pypi.python.org/>Python Package Index</a>, you must make these packages available remotely on the workers.</p><p>For pipelines that consist only of a single Python file or a notebook, the most straightforward way to supply dependencies is to provide a
<code>requirements.txt</code> file. For more complex scenarios, define the <a href=#multiple-file-dependencies>pipeline in a package</a> and consider installing your dependencies in a <a href=#custom-containers>custom container</a>.</p><p>To supply a requirements.txt file:</p><ol><li><p>Find out which packages are installed on your machine. Run the following command:</p><pre><code> pip freeze &gt; requirements.txt
</code></pre><p>This command creates a <code>requirements.txt</code> file that lists all packages that are installed on your machine, regardless of where they were installed from.</p></li><li><p>Edit the <code>requirements.txt</code> file and delete all packages that are not relevant to your code.</p></li><li><p>Run your pipeline with the following command-line option:</p><pre><code> --requirements_file requirements.txt
</code></pre><p>The runner will use the <code>requirements.txt</code> file to install your additional dependencies onto the remote workers.</p></li></ol><blockquote><p><strong>NOTE</strong>: As an alternative to <code>pip freeze</code>, use a library like <a href=https://github.com/jazzband/pip-tools>pip-tools</a> to compile all of the dependencies required for the pipeline from a <code>requirements.in</code> file. In the <code>requirements.in</code> file, only the top-level dependencies are mentioned.</p></blockquote><p>When you supply the <code>--requirements_file</code> pipeline option, during pipeline submission, Beam downloads
the specified packages locally into a requirements cache directory,
and then stages the requirements cache directory to the runner.
At runtime, when available, Beam installs packages from the requirements cache.
This mechanism makes it possible to stage the dependency packages to the runner
at submission. At runtime, the runner workers might be able to install the
packages from the cache without needing a connection to PyPI. To disable staging the
requirements, use the <code>--requirements_cache=skip</code> pipeline option.
For more information, see the <a href=https://beam.apache.org/releases/pydoc/current/_modules/apache_beam/options/pipeline_options.html#SetupOptions>help descriptions of these pipeline options</a>.</p><h2 id=custom-containers>Custom Containers</h2><p>You can pass a <a href="https://hub.docker.com/search?q=apache%2Fbeam&amp;type=image">container</a> image with all the dependencies that are needed for the pipeline. <a href=/documentation/runtime/environments/#running-pipelines>Follow the instructions that show how to run the pipeline with custom container images</a>.</p><ol><li><p>If you are using a custom container image, we recommend that you install the dependencies from the <code>--requirements_file</code> directly into your image at build time. In this case, you do not need to pass the <code>--requirements_file</code> option at runtime, which will reduce the pipeline startup time.</p><pre><code># Add these lines with the path to the requirements.txt to the Dockerfile
COPY &lt;path to requirements.txt&gt; /tmp/requirements.txt
RUN python -m pip install -r /tmp/requirements.txt
</code></pre></li></ol><h2 id=local-or-nonpypi>Local Python packages or non-public Python Dependencies</h2><p>If your pipeline uses packages that are not available publicly (e.g. packages that you&rsquo;ve downloaded from a GitHub repo), make these packages available remotely by performing the following steps:</p><ol><li><p>Identify which packages are installed on your machine and are not public. Run the following command:</p><pre><code> pip freeze
</code></pre><p>This command lists all packages that are installed on your machine, regardless of where they were installed from.</p></li><li><p>Run your pipeline with the following command-line option:</p><pre><code> --extra_package /path/to/package/package-name
</code></pre><p>where package-name is the package&rsquo;s tarball. You can build the package tarball using a command-line tool called <a href=https://setuptools.pypa.io/en/latest/userguide/quickstart.html#install-build>build</a>.</p><pre><code> # Install build using pip
pip install --upgrade build
python -m build --sdist
</code></pre><p>See the <a href=https://pypa-build.readthedocs.io/en/latest/index.html>build documentation</a> for more details on this command.</p></li></ol><h2 id=multiple-file-dependencies>Multiple File Dependencies</h2><p>Often, your pipeline code spans multiple files. To run your project remotely, you must group these files as a Python package and specify the package when you run your pipeline. When the remote workers start, they will install your package. To group your files as a Python package and make it available remotely, perform the following steps:</p><ol><li><p>Create a <a href=https://pythonhosted.org/an_example_pypi_project/setuptools.html>setup.py</a> file for your project. The following is a very basic <code>setup.py</code> file.</p><pre><code> import setuptools
setuptools.setup(
name='PACKAGE-NAME',
version='PACKAGE-VERSION',
install_requires=[
# List Python packages your pipeline depends on.
],
packages=setuptools.find_packages(),
)
</code></pre></li><li><p>Structure your project so that the root directory contains the <code>setup.py</code> file, the main workflow file, and a directory with the rest of the files, for example:</p><pre><code> root_dir/
setup.py
main.py
my_package/
my_pipeline_launcher.py
my_custom_dofns_and_transforms.py
other_utils_and_helpers.py
</code></pre><p>See <a href=https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/complete/juliaset>Juliaset</a> for an example that follows this project structure.</p></li><li><p>Install your package in the submission environment, for example by using the following command:</p><pre><code> pip install -e .
</code></pre></li><li><p>Run your pipeline with the following command-line option:</p><pre><code> --setup_file /path/to/setup.py
</code></pre></li></ol><p><strong>Note:</strong> It is not necessary to supply the <code>--requirements_file</code> <a href=#pypi-dependencies>option</a> if the dependencies of your package are defined in the <code>install_requires</code> field of the <code>setup.py</code> file (see step 1).
However, unlike with the <code>--requirements_file</code> option, when you use the <code>--setup_file</code> option, Beam doesn&rsquo;t stage the dependent packages to the runner.
Only the pipeline package is staged. If they aren&rsquo;t already provided in the runtime environment,
the package dependencies are installed from PyPI at runtime.</p><h2 id=nonpython>Non-Python Dependencies or PyPI Dependencies with Non-Python Dependencies</h2><p>If your pipeline uses non-Python packages, such as packages that require installation using the <code>apt install</code> command, or uses a PyPI package that depends on non-Python dependencies during package installation, we recommend installing them using a <a href=#custom-containers>custom container</a>.
Otherwise, you must perform the following steps.</p><ol><li><p><a href=#multiple-file-dependencies>Structure your pipeline as a package</a>.</p></li><li><p>Add the required installation commands for the non-Python dependencies, such as the <code>apt install</code> commands, to the list of <code>CUSTOM_COMMANDS</code> in your <code>setup.py</code> file. See the <a href=https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/complete/juliaset/setup.py>Juliaset setup.py file</a> for an example.</p><p><strong>Note:</strong> You must verify that these commands run on the remote worker. For example, if you use <code>apt</code>, the remote worker needs <code>apt</code> support.</p></li><li><p>Run your pipeline with the following command-line option:</p><pre><code> --setup_file /path/to/setup.py
</code></pre></li></ol><p><strong>Note:</strong> Because custom commands execute after the dependencies for your workflow are installed (by <code>pip</code>), you should omit the PyPI package dependency from the pipeline&rsquo;s <code>requirements.txt</code> file and from the <code>install_requires</code> parameter in the <code>setuptools.setup()</code> call of your <code>setup.py</code> file.</p><h2 id=pre-building-sdk-container-image>Pre-building SDK Container Image</h2><p>In pipeline execution modes where a Beam runner launches SDK workers in Docker containers, the additional pipeline dependencies (specified via <code>--requirements_file</code> and other runtime options) are installed into the containers at runtime. This can increase the worker startup time.
However, it may be possible to pre-build the SDK containers and perform the dependency installation once before the workers start with <code>--prebuild_sdk_container_engine</code>. For instructions on how to use pre-building with Google Cloud
Dataflow, see <a href=https://cloud.google.com/dataflow/docs/guides/using-custom-containers#prebuild>Pre-building the Python SDK custom container image with extra dependencies</a>.</p><p><strong>NOTE</strong>: This feature is available only for the <code>Dataflow Runner v2</code>.</p><h2 id=pickling-and-managing-the-main-session>Pickling and Managing the Main Session</h2><p>When the Python SDK submits the pipeline for execution to a remote runner, the pipeline contents, such as transform user code, are serialized (or pickled) into bytecode using
libraries that perform the serialization (also called picklers). The default pickler library used by Beam is <code>dill</code>.
To use the <code>cloudpickle</code> pickler, supply the <code>--pickle_library=cloudpickle</code> pipeline option. The <code>cloudpickle</code> support is currently <a href=https://github.com/apache/beam/issues/21298>experimental</a>.</p><p>By default, global imports, functions, and variables defined in the main pipeline module are not saved during the serialization of a Beam job.
Thus, one might encounter an unexpected <code>NameError</code> when running a <code>DoFn</code> on any remote runner. To resolve this, supply the main session content with the pipeline by
setting the <code>--save_main_session</code> pipeline option. This will load the pickled state of the global namespace onto the Dataflow workers (if using <code>DataflowRunner</code>).
For example, see <a href=https://cloud.google.com/dataflow/docs/guides/common-errors#name-error>Handling NameErrors</a> to set the main session on the <code>DataflowRunner</code>.</p><p>Managing the main session in Python SDK is only necessary when using <code>dill</code> pickler on any remote runner. Therefore, this issue will
not occur in <code>DirectRunner</code>.</p><p>Since serialization of the pipeline happens at job submission, and deserialization happens at runtime, it is imperative that the same version of the pickling library is used at job submission and at runtime.
To ensure this, Beam typically sets a very narrow supported version range for pickling libraries. If, for whatever reason, users cannot use the version of <code>dill</code> or <code>cloudpickle</code> required by Beam, and choose to
install a custom version, they must also ensure that they use the same custom version at runtime (e.g. in their custom container,
or by specifying a pipeline dependency requirement).</p><h2 id=control-dependencies>Control the dependencies the pipeline uses</h2><h3 id=pipeline-environments>Pipeline environments</h3><p>To run a Python pipeline on a remote runner, Apache Beam translates the pipeline into a <a href=https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto>runner-independent representation</a> and submits it for execution. Translation happens in the <strong>launch environment</strong>. You can launch the pipeline from a Python virtual environment with the installed Beam SDK, or with tools like <a href=https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates>Dataflow Flex Templates</a>, <a href=https://cloud.google.com/dataflow/docs/guides/interactive-pipeline-development>Notebook environments</a>, <a href=https://airflow.apache.org/>Apache Airflow</a>, and more.</p><p>The <a href=https://beam.apache.org/documentation/runtime/environments/><strong>runtime environment</strong></a> is the Python environment that a runner uses during pipeline execution. This environment is where the pipeline code runs when it performs data processing. The runtime environment includes Apache Beam and pipeline runtime dependencies.</p><h3 id=create-reproducible-environments>Create reproducible environments</h3><p>You can use several tools to build reproducible Python environments:</p><ul><li><p><strong>Use <a href=https://pip.pypa.io/en/stable/user_guide/#requirements-files>requirements files</a>.</strong> After you install dependencies, generate the requirements file by using <code>pip freeze > requirements.txt</code>. 
To recreate an environment, install dependencies from the requirements.txt file by using <code>pip install -r requirements.txt</code>.</p></li><li><p><strong>Use <a href=https://pip.pypa.io/en/stable/user_guide/#constraints-files>constraint files</a>.</strong> You can use the constraint list to restrict the installation of packages, allowing only specified versions.</p></li><li><p><strong>Use lock files.</strong> Use dependency management tools like <a href=https://pipenv.pypa.io/en/latest/>PipEnv</a>, <a href=https://python-poetry.org/>Poetry</a>, and <a href=https://github.com/jazzband/pip-tools>pip-tools</a> to specify top-level dependencies, to generate lock files of all transitive dependencies with pinned versions, and to create virtual environments from these lockfiles.</p></li><li><p><strong>Use Docker container images.</strong> You can package the launch and runtime environment inside a Docker container image. If the image includes all necessary dependencies, then the environment only changes when a container image is rebuilt.</p></li></ul><p>Use version control for the configuration files that define the environment.</p><h3 id=make-the-pipeline-runtime-environment-reproducible>Make the pipeline runtime environment reproducible</h3><p>When a pipeline uses a reproducible runtime environment on a remote runner, the workers on the runner use the same dependencies each time the pipeline runs. A reproducible environment is immune to side-effects caused by releases of the pipeline&rsquo;s direct or transitive dependencies. It doesn’t require dependency resolution at runtime.</p><p>You can create a reproducible runtime environment in the following ways:</p><ul><li><p>Run your pipeline in a custom container image that has all dependencies for your pipeline. Use the <code>--sdk_container_image</code> pipeline option.</p></li><li><p>Supply an exhaustive list of the pipeline&rsquo;s dependencies in the <code>--requirements_file</code> pipeline option. 
Use the <code>--prebuild_sdk_container_engine</code> option to perform the runtime environment initialization sequence before the pipeline execution. If your dependencies don&rsquo;t change, reuse the prebuilt image by using the <code>--sdk_container_image</code> option.</p></li></ul><p>A self-contained runtime environment is usually reproducible. To check if the runtime environment is self-contained, restrict internet access to PyPI in the pipeline runtime. If you use the Dataflow Runner, see the documentation for the <a href=https://cloud.google.com/dataflow/docs/guides/routes-firewall#turn_off_external_ip_address><code>--no_use_public_ips</code></a> pipeline option.</p><p>If you need to recreate or upgrade the runtime environment, do so in a controlled way with visibility into changed dependencies:</p><ul><li><p>Do not modify container images when they are in use by running pipelines.</p></li><li><p>Avoid using the tag <code>:latest</code> with your custom images. Tag your builds with a date or a unique identifier. If something goes wrong, using this type of tag might make it possible to revert the pipeline execution to a previously known working configuration and allow for an inspection of changes.</p></li><li><p>Consider storing the output of <code>pip freeze</code> or the contents of <code>requirements.txt</code> in the version control system.</p></li></ul><h3 id=make-the-pipeline-launch-environment-reproducible>Make the pipeline launch environment reproducible</h3><p>The launch environment runs the <strong>production version</strong> of the pipeline. While developing the pipeline locally, you might use a <strong>development environment</strong> that includes dependencies for development, such as Jupyter or Pylint. The launch environment for production pipelines might not need these additional dependencies. 
You can construct and maintain it separately from the development environment.</p><p>To reduce side-effects on pipeline submissions, it is best to be able to <a href=#create-reproducible-environments>recreate the launch environment in a reproducible manner</a>.</p><p><a href=https://cloud.google.com/dataflow/docs/guides/templates/using-flex-templates>Dataflow Flex Templates</a> provide an example of a containerized, reproducible launch environment.</p><p>To create reproducible installations of Beam into a clean virtual environment, use <a href=https://pip.pypa.io/en/stable/user_guide/#requirements-files>requirements files</a> that list all Python dependencies included in Beam&rsquo;s default container images constraint files:</p><pre tabindex=0><code>BEAM_VERSION=2.48.0
PYTHON_VERSION=`python -c &#34;import sys; print(f&#39;{sys.version_info.major}{sys.version_info.minor}&#39;)&#34;`
pip install apache-beam==$BEAM_VERSION --constraint https://raw.githubusercontent.com/apache/beam/release-${BEAM_VERSION}/sdks/python/container/py${PYTHON_VERSION}/base_image_requirements.txt
</code></pre><p>Use a constraint file to ensure that Beam dependencies in the launch environment match the versions in default Beam containers. A constraint file might also remove the need for dependency resolution at installation time.</p><h3 id=make-the-launch-environment-compatible-with-the-runtime-environment>Make the launch environment compatible with the runtime environment</h3><p>The launch environment translates the pipeline graph into a <a href=https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/beam_runner_api.proto>runner-independent representation</a>. This process involves serializing (or pickling) the code of the transforms. The serialized content is deserialized on the workers. If the runtime worker environment significantly differs from the launch environment, runtime errors might occur for the following reasons:</p><ul><li><p>The Apache Beam version must match in the submission and runtime environments. Python major.minor versions must match as well. Otherwise, the pipeline might fail with errors like <code>Pipeline construction environment and pipeline runtime environment are not compatible</code>. On older SDK versions, the error might be reported as <code>SystemError: unknown opcode</code>.</p></li><li><p>Versions of <code>protobuf</code> in the submission and runtime environment need to match or be compatible.</p></li><li><p>Libraries used in the pipeline code might need to match. If serialized pipeline code has references to functions or modules that aren’t available on the workers, the pipeline might fail with <code>ModuleNotFoundError</code> or <code>AttributeError</code> exceptions on the remote runner. 
If you encounter such errors, make sure that the affected libraries are available on the remote worker, and check whether you need to <a href=https://beam.apache.org/documentation/sdks/python-pipeline-dependencies/#pickling-and-managing-the-main-session>save the main session</a>.</p></li><li><p>The version of the pickling library used at submission time must match the version installed at runtime. To enforce this, Beam sets tight bounds on the version of serializer libraries (dill and cloudpickle). You can force install a different version of <code>dill</code> or <code>cloudpickle</code> than required by Beam under the following conditions:</p><ul><li>You install the same version in the submission and runtime environments.</li><li>The chosen version works for your pipeline.</li></ul></li></ul><p>To check whether the runtime environment matches the launch environment, inspect differences in the <code>pip freeze</code> output in both environments. Update to the latest version of Beam, because environment compatibility checks are included in newer SDK versions.</p><p>Finally, you can use the same environment by launching the pipeline from the containerized environment that you use at runtime. <a href=https://cloud.google.com/dataflow/docs/guides/templates/configuring-flex-templates#use_custom_container_images>Dataflow Flex templates built from a custom container image</a> offer this setup. In this scenario, you can recreate both launch and runtime environments in a reproducible manner. 
Because both containers are created from the same image, the launch and runtime environments are compatible with each other by default.</p></div></div><footer class=footer><div class=footer__contained><div class=footer__cols><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col__logo><img src=/images/beam_logo_circle.svg class=footer__logo alt="Beam logo"></div><div class=footer__cols__col__logo><img src=/images/apache_logo_circle.svg class=footer__logo alt="Apache logo"></div></div><div class=footer-wrapper><div class=wrapper-grid><div class=footer__cols__col><div class=footer__cols__col__title>Start</div><div class=footer__cols__col__link><a href=/get-started/beam-overview/>Overview</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-java/>Quickstart (Java)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-py/>Quickstart (Python)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-go/>Quickstart (Go)</a></div><div class=footer__cols__col__link><a href=/get-started/downloads/>Downloads</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Docs</div><div class=footer__cols__col__link><a href=/documentation/programming-guide/>Concepts</a></div><div class=footer__cols__col__link><a href=/documentation/pipelines/design-your-pipeline/>Pipelines</a></div><div class=footer__cols__col__link><a href=/documentation/runners/capability-matrix/>Runners</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Community</div><div class=footer__cols__col__link><a href=/contribute/>Contribute</a></div><div class=footer__cols__col__link><a href=https://projects.apache.org/committee.html?beam target=_blank>Team<img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></div><div class=footer__cols__col__link><a href=/community/presentation-materials/>Media</a></div><div class=footer__cols__col__link><a 
href=/community/in-person/>Events/Meetups</a></div><div class=footer__cols__col__link><a href=/community/contact-us/>Contact Us</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Resources</div><div class=footer__cols__col__link><a href=/blog/>Blog</a></div><div class=footer__cols__col__link><a href=https://github.com/apache/beam>GitHub</a></div></div></div><div class=footer__bottom>&copy;
<a href=https://www.apache.org>The Apache Software Foundation</a>
| <a href=/privacy_policy>Privacy Policy</a>
| <a href=/feed.xml>RSS Feed</a><br><br>Apache Beam, Apache, Beam, the Beam logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation. All other products or name brands are trademarks of their respective holders, including The Apache Software Foundation.</div></div><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://github.com/apache/beam><img src=/images/logos/social-icons/github-logo-150.png class=footer__logo alt="Github logo"></a></div><div class=footer__cols__col__logo><a href=https://www.linkedin.com/company/apache-beam/><img src=/images/logos/social-icons/linkedin-logo-150.png class=footer__logo alt="Linkedin logo"></a></div></div><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://twitter.com/apachebeam><img src=/images/logos/social-icons/twitter-logo-150.png class=footer__logo alt="Twitter logo"></a></div><div class=footer__cols__col__logo><a href=https://www.youtube.com/channel/UChNnb_YO_7B0HlW6FhAXZZQ><img src=/images/logos/social-icons/youtube-logo-150.png class=footer__logo alt="Youtube logo"></a></div></div></div></div></div></footer></body></html>