<!-- blob: aa6ae57b5ec27d85fe3630f1d1acbe99d6ef6b98 [file] [log] [blame] -->
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Lineage &mdash; Airflow Documentation</title>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Changelog" href="changelog.html" />
<link rel="prev" title="Kubernetes" href="kubernetes.html" />
<script src="_static/js/modernizr.min.js"></script>
<!-- Matomo -->
<script>
// Matomo bootstrap: reuse an existing command queue on window if one is
// already present, otherwise start a fresh one. Commands are queued in the
// same order as before: cookie-less tracking, page view, link tracking,
// then the tracker endpoint and site id.
var _paq = window._paq = window._paq || [];
_paq.push(['disableCookies']);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var baseUrl = "https://analytics.apache.org/";
  _paq.push(['setTrackerUrl', baseUrl + 'matomo.php']);
  _paq.push(['setSiteId', '13']);

  // Inject the tracker script asynchronously, ahead of the first <script>
  // element already in the document.
  var firstScript = document.getElementsByTagName('script')[0];
  var loader = document.createElement('script');
  loader.async = true;
  loader.src = baseUrl + 'matomo.js';
  firstScript.parentNode.insertBefore(loader, firstScript);
})();
</script>
<!-- End Matomo -->
<link rel="canonical" href="https://airflow.apache.org/docs/apache-airflow/stable/lineage.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search">
<a href="index.html" class="icon icon-home"> Airflow
</a>
<div class="version">
1.10.2
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="project.html">Project</a></li>
<li class="toctree-l1"><a class="reference internal" href="license.html">License</a></li>
<li class="toctree-l1"><a class="reference internal" href="start.html">Quick Start</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="tutorial.html">Tutorial</a></li>
<li class="toctree-l1"><a class="reference internal" href="howto/index.html">How-to Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="ui.html">UI / Screenshots</a></li>
<li class="toctree-l1"><a class="reference internal" href="concepts.html">Concepts</a></li>
<li class="toctree-l1"><a class="reference internal" href="profiling.html">Data Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="cli.html">Command Line Interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="scheduler.html">Scheduling &amp; Triggers</a></li>
<li class="toctree-l1"><a class="reference internal" href="plugins.html">Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="security.html">Security</a></li>
<li class="toctree-l1"><a class="reference internal" href="timezone.html">Time zones</a></li>
<li class="toctree-l1"><a class="reference internal" href="api.html">Experimental Rest API</a></li>
<li class="toctree-l1"><a class="reference internal" href="integration.html">Integration</a></li>
<li class="toctree-l1"><a class="reference internal" href="metrics.html">Metrics</a></li>
<li class="toctree-l1"><a class="reference internal" href="kubernetes.html">Kubernetes</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Lineage</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#apache-atlas">Apache Atlas</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="changelog.html">Changelog</a></li>
<li class="toctree-l1"><a class="reference internal" href="faq.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="code.html">API Reference</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Airflow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> &raquo;</li>
<li>Lineage</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/lineage.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="lineage">
<h1>Lineage<a class="headerlink" href="#lineage" title="Permalink to this headline"></a></h1>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Lineage support is very experimental and subject to change.</p>
</div>
<p>Airflow can help track the origins of data, what happens to it, and where it moves over time. This can help with
audit trails and data governance, as well as with debugging data flows.</p>
<p>Airflow tracks data by means of inlets and outlets of the tasks. Let’s work from an example and see how it
works.</p>
<!-- Example lineage DAG. The sample imports airflow.utils.dates explicitly because it calls
     airflow.utils.dates.days_ago(); whitespace inside the <pre> is significant (Python indentation). -->
<div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">airflow.utils.dates</span>
<span class="kn">from</span> <span class="nn">airflow.operators.bash_operator</span> <span class="k">import</span> <span class="n">BashOperator</span>
<span class="kn">from</span> <span class="nn">airflow.operators.dummy_operator</span> <span class="k">import</span> <span class="n">DummyOperator</span>
<span class="kn">from</span> <span class="nn">airflow.lineage.datasets</span> <span class="k">import</span> <span class="n">File</span>
<span class="kn">from</span> <span class="nn">airflow.models</span> <span class="k">import</span> <span class="n">DAG</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="k">import</span> <span class="n">timedelta</span>

<span class="n">FILE_CATEGORIES</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;CAT1&quot;</span><span class="p">,</span> <span class="s2">&quot;CAT2&quot;</span><span class="p">,</span> <span class="s2">&quot;CAT3&quot;</span><span class="p">]</span>

<span class="n">args</span> <span class="o">=</span> <span class="p">{</span>
    <span class="s1">&#39;owner&#39;</span><span class="p">:</span> <span class="s1">&#39;airflow&#39;</span><span class="p">,</span>
    <span class="s1">&#39;start_date&#39;</span><span class="p">:</span> <span class="n">airflow</span><span class="o">.</span><span class="n">utils</span><span class="o">.</span><span class="n">dates</span><span class="o">.</span><span class="n">days_ago</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="p">}</span>

<span class="n">dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span>
    <span class="n">dag_id</span><span class="o">=</span><span class="s1">&#39;example_lineage&#39;</span><span class="p">,</span> <span class="n">default_args</span><span class="o">=</span><span class="n">args</span><span class="p">,</span>
    <span class="n">schedule_interval</span><span class="o">=</span><span class="s1">&#39;0 0 * * *&#39;</span><span class="p">,</span>
    <span class="n">dagrun_timeout</span><span class="o">=</span><span class="n">timedelta</span><span class="p">(</span><span class="n">minutes</span><span class="o">=</span><span class="mi">60</span><span class="p">))</span>

<span class="n">f_final</span> <span class="o">=</span> <span class="n">File</span><span class="p">(</span><span class="s2">&quot;/tmp/final&quot;</span><span class="p">)</span>
<span class="n">run_this_last</span> <span class="o">=</span> <span class="n">DummyOperator</span><span class="p">(</span><span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;run_this_last&#39;</span><span class="p">,</span> <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">,</span>
    <span class="n">inlets</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;auto&quot;</span><span class="p">:</span> <span class="kc">True</span><span class="p">},</span>
    <span class="n">outlets</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;datasets&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">f_final</span><span class="p">,]})</span>

<span class="n">f_in</span> <span class="o">=</span> <span class="n">File</span><span class="p">(</span><span class="s2">&quot;/tmp/whole_directory/&quot;</span><span class="p">)</span>
<span class="n">outlets</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">file</span> <span class="ow">in</span> <span class="n">FILE_CATEGORIES</span><span class="p">:</span>
    <span class="n">f_out</span> <span class="o">=</span> <span class="n">File</span><span class="p">(</span><span class="s2">&quot;/tmp/</span><span class="si">{}</span><span class="s2">/{{{{ execution_date }}}}&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">file</span><span class="p">))</span>
    <span class="n">outlets</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">f_out</span><span class="p">)</span>
<span class="n">run_this</span> <span class="o">=</span> <span class="n">BashOperator</span><span class="p">(</span>
    <span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;run_me_first&#39;</span><span class="p">,</span> <span class="n">bash_command</span><span class="o">=</span><span class="s1">&#39;echo 1&#39;</span><span class="p">,</span> <span class="n">dag</span><span class="o">=</span><span class="n">dag</span><span class="p">,</span>
    <span class="n">inlets</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;datasets&quot;</span><span class="p">:</span> <span class="p">[</span><span class="n">f_in</span><span class="p">,]},</span>
    <span class="n">outlets</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;datasets&quot;</span><span class="p">:</span> <span class="n">outlets</span><span class="p">}</span>
<span class="p">)</span>
<span class="n">run_this</span><span class="o">.</span><span class="n">set_downstream</span><span class="p">(</span><span class="n">run_this_last</span><span class="p">)</span>
</pre></div>
</div>
<p>Tasks take the parameters <cite>inlets</cite> and <cite>outlets</cite>. Inlets can be manually defined as a list of datasets, <cite>{"datasets":
[dataset1, dataset2]}</cite>, can be configured to look for outlets from specific upstream tasks, <cite>{"task_ids": ["task_id1", "task_id2"]}</cite>,
can be configured to pick up outlets from direct upstream tasks, <cite>{"auto": True}</cite>, or can be a combination of these. Outlets
are defined as a list of datasets, <cite>{"datasets": [dataset1, dataset2]}</cite>. Any fields of the dataset are templated with
the context when the task is being executed.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Operators can add inlets and outlets automatically if the operator supports it.</p>
</div>
<p>In the example DAG, the task <cite>run_me_first</cite> is a BashOperator that takes one inlet, <cite>/tmp/whole_directory/</cite>, and produces 3 outlets, one for each of <cite>CAT1</cite>, <cite>CAT2</cite> and <cite>CAT3</cite>, that are
generated from a list. Note that <cite>execution_date</cite> is a templated field and will be rendered when the task is running.</p>
<div class="admonition note">
<p class="first admonition-title">Note</p>
<p class="last">Behind the scenes, Airflow prepares the lineage metadata as part of the <cite>pre_execute</cite> method of a task. When the task
has finished execution, <cite>post_execute</cite> is called and the lineage metadata is pushed into XCom. Thus, if you are creating
your own operators that override these methods, make sure to decorate them with <cite>prepare_lineage</cite> and <cite>apply_lineage</cite>
respectively.</p>
</div>
<div class="section" id="apache-atlas">
<h2>Apache Atlas<a class="headerlink" href="#apache-atlas" title="Permalink to this headline"></a></h2>
<p>Airflow can send its lineage metadata to Apache Atlas. You need to enable the <cite>atlas</cite> backend and configure it
properly, e.g. in your <cite>airflow.cfg</cite>:</p>
<div class="code python highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">lineage</span><span class="p">]</span>
<span class="n">backend</span> <span class="o">=</span> <span class="n">airflow</span><span class="o">.</span><span class="n">lineage</span><span class="o">.</span><span class="n">backend</span><span class="o">.</span><span class="n">atlas</span>
<span class="p">[</span><span class="n">atlas</span><span class="p">]</span>
<span class="n">username</span> <span class="o">=</span> <span class="n">my_username</span>
<span class="n">password</span> <span class="o">=</span> <span class="n">my_password</span>
<span class="n">host</span> <span class="o">=</span> <span class="n">host</span>
<span class="n">port</span> <span class="o">=</span> <span class="mi">21000</span>
</pre></div>
</div>
<p>Please make sure to have the <cite>atlasclient</cite> package installed.</p>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="changelog.html" class="btn btn-neutral float-right" title="Changelog" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="kubernetes.html" class="btn btn-neutral" title="Kubernetes" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<script type="text/javascript">
// Passing a function to jQuery() registers it as a DOM-ready callback, so the
// call below runs only after the document has been parsed.
// NOTE(review): SphinxRtdTheme is presumably defined by _static/js/theme.js
// (loaded just before this script), and the `true` argument presumably turns
// on the sticky sidebar navigation; confirm against the theme source.
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>