<!--
Javascript to render AIRFLOW-XXX and PR references in text
as HTML links.
Overrides extrahead block from sphinx_rtd_theme
https://www.sphinx-doc.org/en/master/templating.html
-->
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Best Practices &mdash; Airflow Documentation</title>
<link rel="shortcut icon" href="_static/pin_32.png"/>
<script type="text/javascript" src="_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
<script type="text/javascript" src="_static/jquery.js"></script>
<script type="text/javascript" src="_static/underscore.js"></script>
<script type="text/javascript" src="_static/doctools.js"></script>
<script type="text/javascript" src="_static/language_data.js"></script>
<script type="text/javascript" src="_static/jira-links.js"></script>
<script type="text/javascript" src="_static/js/theme.js"></script>
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/graphviz.css" type="text/css" />
<link rel="stylesheet" href="_static/exampleinclude.css" type="text/css" />
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="FAQ" href="faq.html" />
<link rel="prev" title="Changelog" href="changelog.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home"> Airflow
</a>
<div class="version">
1.10.8
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="project.html">Project</a></li>
<li class="toctree-l1"><a class="reference internal" href="license.html">License</a></li>
<li class="toctree-l1"><a class="reference internal" href="start.html">Quick Start</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="tutorial.html">Tutorial</a></li>
<li class="toctree-l1"><a class="reference internal" href="howto/index.html">How-to Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="ui.html">UI / Screenshots</a></li>
<li class="toctree-l1"><a class="reference internal" href="concepts.html">Concepts</a></li>
<li class="toctree-l1"><a class="reference internal" href="profiling.html">Data Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="cli.html">Command Line Interface Reference</a></li>
<li class="toctree-l1"><a class="reference internal" href="scheduler.html">Scheduling &amp; Triggers</a></li>
<li class="toctree-l1"><a class="reference internal" href="executor/index.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="plugins.html">Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="security.html">Security</a></li>
<li class="toctree-l1"><a class="reference internal" href="timezone.html">Time zones</a></li>
<li class="toctree-l1"><a class="reference internal" href="api.html">REST API Reference</a></li>
<li class="toctree-l1"><a class="reference internal" href="integration.html">Integration</a></li>
<li class="toctree-l1"><a class="reference internal" href="metrics.html">Metrics</a></li>
<li class="toctree-l1"><a class="reference internal" href="errors.html">Error Tracking</a></li>
<li class="toctree-l1"><a class="reference internal" href="kubernetes.html">Kubernetes</a></li>
<li class="toctree-l1"><a class="reference internal" href="lineage.html">Lineage</a></li>
<li class="toctree-l1"><a class="reference internal" href="dag-serialization.html">DAG Serialization</a></li>
<li class="toctree-l1"><a class="reference internal" href="changelog.html">Changelog</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Best Practices</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#writing-a-dag">Writing a DAG</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#creating-a-task">Creating a task</a></li>
<li class="toctree-l3"><a class="reference internal" href="#deleting-a-task">Deleting a task</a></li>
<li class="toctree-l3"><a class="reference internal" href="#communication">Communication</a></li>
<li class="toctree-l3"><a class="reference internal" href="#variables">Variables</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#testing-a-dag">Testing a DAG</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#dag-loader-test">DAG Loader Test</a></li>
<li class="toctree-l3"><a class="reference internal" href="#unit-tests">Unit tests</a></li>
<li class="toctree-l3"><a class="reference internal" href="#self-checks">Self-Checks</a></li>
<li class="toctree-l3"><a class="reference internal" href="#staging-environment">Staging environment</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#deployment-in-production">Deployment in Production</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#database-backend">Database backend</a></li>
<li class="toctree-l3"><a class="reference internal" href="#multi-node-cluster">Multi-Node Cluster</a></li>
<li class="toctree-l3"><a class="reference internal" href="#logging">Logging</a></li>
<li class="toctree-l3"><a class="reference internal" href="#configuration">Configuration</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="faq.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="macros.html">Macros reference</a></li>
<li class="toctree-l1"><a class="reference internal" href="privacy_notice.html">Privacy Notice</a></li>
</ul>
<p class="caption"><span class="caption-text">References</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="_api/index.html">Python API</a></li>
<li class="toctree-l1"><a class="reference internal" href="configurations-ref.html">Configurations</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">Airflow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html">Docs</a> &raquo;</li>
<li>Best Practices</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/best-practices.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="best-practices">
<h1>Best Practices<a class="headerlink" href="#best-practices" title="Permalink to this headline"></a></h1>
<p>Running Airflow in production is seamless. It comes bundled with all the plugins and configs
necessary to run most of the DAGs. However, you may still run into certain pitfalls that can cause occasional errors.
Let’s take a look at what you need to do at each stage, from writing the DAG
to deploying it in the production environment, to avoid these pitfalls.</p>
<div class="section" id="writing-a-dag">
<h2>Writing a DAG<a class="headerlink" href="#writing-a-dag" title="Permalink to this headline"></a></h2>
<p>Creating a new DAG in Airflow is quite simple. However, there are many things you need to take care of
to ensure that a DAG run or failure does not produce unexpected results.</p>
<div class="section" id="creating-a-task">
<h3>Creating a task<a class="headerlink" href="#creating-a-task" title="Permalink to this headline"></a></h3>
<p>You should treat tasks in Airflow like transactions in a database: they should never produce
incomplete results. For example, do not leave partially written data in <code class="docutils literal notranslate"><span class="pre">HDFS</span></code> or <code class="docutils literal notranslate"><span class="pre">S3</span></code> at the end of a task.</p>
<p>Airflow can retry a task if it fails, so a task should produce the same outcome on every re-run.
Some ways to avoid producing a different result are listed below, followed by a short sketch:</p>
<ul class="simple">
<li><p>Do not use INSERT during a task re-run; an INSERT statement might lead to duplicate rows in your database.
Replace it with UPSERT.</p></li>
<li><p>Read and write in a specific partition. Never read the latest available data in a task:
someone may update the input data between re-runs, which results in different outputs.
A better way is to read the input data from a specific partition. You can use <code class="docutils literal notranslate"><span class="pre">execution_date</span></code> as the partition key.
You should follow this partitioning method when writing data to S3/HDFS as well.</p></li>
<li><p>The Python datetime <code class="docutils literal notranslate"><span class="pre">now()</span></code> function returns the current datetime object.
It should never be used inside a task, especially for critical computation, as it leads to a different outcome on each run.
It is fine to use it, for example, to generate a temporary log message.</p></li>
</ul>
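<p>A minimal sketch of an idempotent, partition-aware task (the bucket name, task id and the existing <code class="docutils literal notranslate"><span class="pre">dag</span></code> object are assumptions for illustration):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>from airflow.operators.python_operator import PythonOperator

def process_partition(execution_date, **context):
    # Read and write a fixed partition derived from execution_date, so every
    # re-run of the same DAG run touches exactly the same data.
    partition = execution_date.strftime("%Y-%m-%d")
    input_path = "s3://my-bucket/input/dt={}/".format(partition)    # hypothetical bucket
    output_path = "s3://my-bucket/output/dt={}/".format(partition)
    # ... read from input_path, transform, and overwrite output_path
    # (UPSERT / overwrite semantics, never a bare INSERT or append) ...

process = PythonOperator(
    task_id="process_partition",
    python_callable=process_partition,
    provide_context=True,  # passes execution_date and the rest of the context
    dag=dag,               # assumes a DAG object named `dag` already exists
)
</pre></div>
</div>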
<div class="admonition tip">
<p class="admonition-title">Tip</p>
<p>You should define repetitive parameters such as <code class="docutils literal notranslate"><span class="pre">connection_id</span></code> or S3 paths in <code class="docutils literal notranslate"><span class="pre">default_args</span></code> rather than declaring them for each task.
The <code class="docutils literal notranslate"><span class="pre">default_args</span></code> help to avoid mistakes such as typographical errors.</p>
</div>
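<p>For instance, a short sketch of shared <code class="docutils literal notranslate"><span class="pre">default_args</span></code> (the connection id and dag id below are placeholders; keys that a given operator does not accept are simply not applied to it):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>from datetime import datetime
from airflow import DAG

default_args = {
    "owner": "airflow",
    "retries": 2,
    "aws_conn_id": "my_aws_conn",  # defined once instead of on every task
}

dag = DAG(
    dag_id="example_default_args",
    default_args=default_args,
    start_date=datetime(2020, 1, 1),
    schedule_interval="@daily",
)
</pre></div>
</div>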
</div>
<div class="section" id="deleting-a-task">
<h3>Deleting a task<a class="headerlink" href="#deleting-a-task" title="Permalink to this headline"></a></h3>
<p>Never delete a task from a DAG. If a task is deleted, its historical information disappears from the Airflow UI.
If tasks really need to be removed, it is advised to create a new DAG instead.</p>
</div>
<div class="section" id="communication">
<h3>Communication<a class="headerlink" href="#communication" title="Permalink to this headline"></a></h3>
<p>Airflow may execute the tasks of a DAG on different servers if you are using the <a class="reference internal" href="executor/kubernetes.html"><span class="doc">Kubernetes executor</span></a> or <a class="reference internal" href="executor/celery.html"><span class="doc">Celery executor</span></a>.
Therefore, you should not store any file or config in the local filesystem, as the next task is likely to run on a different server without access to it; for example, a task that downloads the data file that the next task processes.
Even with the <a class="reference internal" href="_api/airflow/executors/local_executor/index.html#airflow.executors.local_executor.LocalExecutor" title="airflow.executors.local_executor.LocalExecutor"><code class="xref py py-class docutils literal notranslate"><span class="pre">Local</span> <span class="pre">executor</span></code></a>,
storing a file on disk can make retries harder, e.g., if your task requires a config file that is deleted by another task in the DAG.</p>
<p>If possible, use <code class="docutils literal notranslate"><span class="pre">XCom</span></code> to communicate small messages between tasks; a good way of passing larger data between tasks is to use remote storage such as S3/HDFS.
For example, if a task stores processed data in S3, it can push the S3 path for the output data to <code class="docutils literal notranslate"><span class="pre">XCom</span></code>,
and the downstream tasks can pull the path from XCom and use it to read the data, as sketched below.</p>
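<p>A minimal sketch of this pattern (the bucket name and task ids are hypothetical, and a <code class="docutils literal notranslate"><span class="pre">dag</span></code> object is assumed to exist):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>from airflow.operators.python_operator import PythonOperator

def write_data(**context):
    output_path = "s3://my-bucket/processed/{}.parquet".format(context["ds"])
    # ... write the processed data to output_path ...
    return output_path  # the return value is pushed to XCom automatically

def read_data(**context):
    output_path = context["ti"].xcom_pull(task_ids="write_data")
    # ... read the data back from output_path ...

write = PythonOperator(task_id="write_data", python_callable=write_data,
                       provide_context=True, dag=dag)
read = PythonOperator(task_id="read_data", python_callable=read_data,
                      provide_context=True, dag=dag)
write &gt;&gt; read
</pre></div>
</div>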
<p>The tasks should also not store any authentication parameters such as passwords or tokens inside them.
Wherever possible, use <a class="reference internal" href="concepts.html#concepts-connections"><span class="std std-ref">Connections</span></a> to store such data securely in the Airflow backend and retrieve it using a unique connection id.</p>
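<p>For example, a short sketch (the connection id is a placeholder) of reading credentials from a Connection inside a task instead of hard-coding them:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>from airflow.hooks.base_hook import BaseHook

def call_external_service(**context):
    conn = BaseHook.get_connection("my_api_conn")  # placeholder connection id
    # conn.host, conn.login and conn.password come from the Airflow backend,
    # so no secrets live in the DAG file itself.
    ...
</pre></div>
</div>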
</div>
<div class="section" id="variables">
<h3>Variables<a class="headerlink" href="#variables" title="Permalink to this headline"></a></h3>
<p>You should avoid using Variables outside an operator’s <code class="docutils literal notranslate"><span class="pre">execute()</span></code> method or Jinja templates if possible.
Fetching a Variable opens a connection to Airflow’s metadata DB to read the value, which can slow down parsing and place extra load on the DB.</p>
<p>Airflow parses all the DAGs in the background at a specific interval.
The default interval is set by the <code class="docutils literal notranslate"><span class="pre">processor_poll_interval</span></code> config, which is 1 second by default. During parsing, Airflow creates a new connection to the metadata DB for each DAG that fetches a Variable at parse time,
which can result in a lot of open connections.</p>
<p>The best way of using Variables is via a Jinja template, which delays reading the value until task execution. The template syntax to do this is:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{{</span> <span class="n">var</span><span class="o">.</span><span class="n">value</span><span class="o">.&lt;</span><span class="n">variable_name</span><span class="o">&gt;</span> <span class="p">}}</span>
</pre></div>
</div>
<p>or, if you need to deserialize a JSON object from the variable:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{{</span> <span class="n">var</span><span class="o">.</span><span class="n">json</span><span class="o">.&lt;</span><span class="n">variable_name</span><span class="o">&gt;</span> <span class="p">}}</span>
</pre></div>
</div>
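<p>For example, a sketch (the variable and task names are hypothetical) of referencing a Variable only through the template, so that it is resolved when the task runs rather than when the DAG is parsed:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>from airflow.operators.bash_operator import BashOperator

transfer = BashOperator(
    task_id="transfer",
    # the Variable is read only when the task actually executes
    bash_command="my_script.sh {{ var.value.data_bucket }}",
    dag=dag,  # assumes an existing DAG object
)
</pre></div>
</div>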
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>In general, you should not write any code outside the tasks. The code outside the tasks runs every time Airflow parses the DAG, which happens every second by default.</p>
</div>
</div>
</div>
<div class="section" id="testing-a-dag">
<h2>Testing a DAG<a class="headerlink" href="#testing-a-dag" title="Permalink to this headline"></a></h2>
<p>Airflow users should treat DAGs as production-level code. DAGs should have various tests to ensure that they produce the expected results.
You can write a wide variety of tests for a DAG. Let’s take a look at some of them.</p>
<div class="section" id="dag-loader-test">
<h3>DAG Loader Test<a class="headerlink" href="#dag-loader-test" title="Permalink to this headline"></a></h3>
<p>This test ensures that your DAG does not contain any code that raises an error while loading.
No additional code needs to be written by the user to run this test.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="n">your</span><span class="o">-</span><span class="n">dag</span><span class="o">-</span><span class="n">file</span><span class="o">.</span><span class="n">py</span>
</pre></div>
</div>
<p>If the above command runs without any error, your DAG does not contain uninstalled dependencies, syntax errors, etc.</p>
<p>You can look into <a class="reference internal" href="tutorial.html#testing"><span class="std std-ref">Testing a DAG</span></a> for details on how to test individual operators.</p>
</div>
<div class="section" id="unit-tests">
<h3>Unit tests<a class="headerlink" href="#unit-tests" title="Permalink to this headline"></a></h3>
<p>Unit tests ensure that there is no incorrect code in your DAG. You can write a unit test for your tasks as well as your DAG.</p>
<p><strong>Unit test for loading a DAG:</strong></p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">airflow.models</span> <span class="kn">import</span> <span class="n">DagBag</span>
<span class="kn">import</span> <span class="nn">unittest</span>
<span class="k">class</span> <span class="nc">TestHelloWorldDAG</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
<span class="nd">@classmethod</span>
<span class="k">def</span> <span class="nf">setUpClass</span><span class="p">(</span><span class="bp">cls</span><span class="p">):</span>
<span class="bp">cls</span><span class="o">.</span><span class="n">dagbag</span> <span class="o">=</span> <span class="n">DagBag</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">test_dag_loaded</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="n">dag</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">dagbag</span><span class="o">.</span><span class="n">get_dag</span><span class="p">(</span><span class="n">dag_id</span><span class="o">=</span><span class="s1">&#39;hello_world&#39;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertDictEqual</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dagbag</span><span class="o">.</span><span class="n">import_errors</span><span class="p">,</span> <span class="p">{})</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertIsNotNone</span><span class="p">(</span><span class="n">dag</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">dag</span><span class="o">.</span><span class="n">tasks</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Unit test for custom operator:</strong></p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unittest</span>
<span class="kn">from</span> <span class="nn">airflow.utils.state</span> <span class="kn">import</span> <span class="n">State</span>
<span class="n">DEFAULT_DATE</span> <span class="o">=</span> <span class="s1">&#39;2019-10-03&#39;</span>
<span class="n">TEST_DAG_ID</span> <span class="o">=</span> <span class="s1">&#39;test_my_custom_operator&#39;</span>
<span class="k">class</span> <span class="nc">MyCustomOperatorTest</span><span class="p">(</span><span class="n">unittest</span><span class="o">.</span><span class="n">TestCase</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">setUp</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">dag</span> <span class="o">=</span> <span class="n">DAG</span><span class="p">(</span><span class="n">TEST_DAG_ID</span><span class="p">,</span> <span class="n">schedule_interval</span><span class="o">=</span><span class="s1">&#39;@daily&#39;</span><span class="p">,</span> <span class="n">default_args</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;start_date&#39;</span> <span class="p">:</span> <span class="n">DEFAULT_DATE</span><span class="p">})</span>
<span class="bp">self</span><span class="o">.</span><span class="n">op</span> <span class="o">=</span> <span class="n">MyCustomOperator</span><span class="p">(</span>
<span class="n">dag</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">dag</span><span class="p">,</span>
<span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;test&#39;</span><span class="p">,</span>
<span class="n">prefix</span><span class="o">=</span><span class="s1">&#39;s3://bucket/some/prefix&#39;</span><span class="p">,</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ti</span> <span class="o">=</span> <span class="n">TaskInstance</span><span class="p">(</span><span class="n">task</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">op</span><span class="p">,</span> <span class="n">execution_date</span><span class="o">=</span><span class="n">DEFAULT_DATE</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">test_execute_no_trigger</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ti</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">ignore_ti_state</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">assertEqual</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">ti</span><span class="o">.</span><span class="n">state</span><span class="p">,</span> <span class="n">State</span><span class="o">.</span><span class="n">SUCCESS</span><span class="p">)</span>
<span class="c1">#Assert something related to tasks results</span>
</pre></div>
</div>
</div>
<div class="section" id="self-checks">
<h3>Self-Checks<a class="headerlink" href="#self-checks" title="Permalink to this headline"></a></h3>
<p>You can also implement checks in a DAG to make sure the tasks produce the expected results.
As an example, if you have a task that pushes data to S3, you can implement a check in the next task that
makes sure the partition is created in S3 and performs some simple checks to determine whether the data is correct.</p>
<p>Similarly, if you have a task that starts a microservice in Kubernetes or Mesos, you should check whether the service has started, for example using <a class="reference internal" href="_api/airflow/sensors/http_sensor/index.html#airflow.sensors.http_sensor.HttpSensor" title="airflow.sensors.http_sensor.HttpSensor"><code class="xref py py-class docutils literal notranslate"><span class="pre">airflow.sensors.http_sensor.HttpSensor</span></code></a>.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">task</span> <span class="o">=</span> <span class="n">PushToS3</span><span class="p">(</span><span class="o">...</span><span class="p">)</span>
<span class="n">check</span> <span class="o">=</span> <span class="n">S3KeySensor</span><span class="p">(</span>
<span class="n">task_id</span><span class="o">=</span><span class="s1">&#39;check_parquet_exists&#39;</span><span class="p">,</span>
<span class="n">bucket_key</span><span class="o">=</span><span class="s2">&quot;s3://bucket/key/foo.parquet&quot;</span><span class="p">,</span>
<span class="n">poke_interval</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="n">timeout</span><span class="o">=</span><span class="mi">0</span>
<span class="p">)</span>
<span class="n">task</span> <span class="o">&gt;&gt;</span> <span class="n">check</span>
</pre></div>
</div>
</div>
<div class="section" id="staging-environment">
<h3>Staging environment<a class="headerlink" href="#staging-environment" title="Permalink to this headline"></a></h3>
<p>If possible, keep a staging environment to test the complete DAG run before deploying to production.
Make sure your DAG is parameterized so that environment-specific values can be changed, e.g., the output path of an S3 operation or the database used to read the configuration.
Do not hard-code values inside the DAG and then change them manually according to the environment.</p>
<p>You can use environment variables to parameterize the DAG.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="n">dest</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
<span class="s2">&quot;MY_DAG_DEST_PATH&quot;</span><span class="p">,</span>
<span class="s2">&quot;s3://default-target/path/&quot;</span>
<span class="p">)</span>
</pre></div>
</div>
</div>
</div>
<div class="section" id="deployment-in-production">
<h2>Deployment in Production<a class="headerlink" href="#deployment-in-production" title="Permalink to this headline"></a></h2>
<p>Once you have completed all the mentioned checks, it is time to deploy your DAG in production.
To do this, you first need to make sure that Airflow itself is production-ready.
Let’s see what precautions you need to take.</p>
<div class="section" id="database-backend">
<h3>Database backend<a class="headerlink" href="#database-backend" title="Permalink to this headline"></a></h3>
<p>Airflow comes with an <code class="docutils literal notranslate"><span class="pre">SQLite</span></code> backend by default. This allows the user to run Airflow without any external database.
However, such a setup is meant for testing purposes only; running the default setup in production can lead to data loss in multiple scenarios.
If you want to run Airflow in production, make sure you <a class="reference internal" href="howto/initialize-database.html"><span class="doc">configure the backend</span></a> to be an external database such as PostgreSQL or MySQL.</p>
<p>You can change the backend using the following config:</p>
<div class="highlight-ini notranslate"><div class="highlight"><pre><span></span><span class="k">[core]</span>
<span class="na">sql_alchemy_conn</span> <span class="o">=</span> <span class="s">my_conn_string</span>
</pre></div>
</div>
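<p>For example, a PostgreSQL connection string (the host, credentials and database name below are placeholders) would look like:</p>
<div class="highlight-ini notranslate"><div class="highlight"><pre><span></span>[core]
sql_alchemy_conn = postgresql+psycopg2://airflow_user:airflow_pass@db.example.com:5432/airflow_db
</pre></div>
</div>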
<p>Once you have changed the backend, Airflow needs to create all the tables required for operation.
Create an empty DB and give Airflow’s database user permission to <code class="docutils literal notranslate"><span class="pre">CREATE/ALTER</span></code> it.
Once that is done, you can run:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">airflow</span> <span class="n">db</span> <span class="n">upgrade</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">upgrade</span></code> keeps track of migrations already applies, so it’s safe to run as often as you need.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Do not use <code class="docutils literal notranslate"><span class="pre">airflow</span> <span class="pre">initdb</span></code> as it can create a lot of default connections, charts, etc. which are not required in a production DB.</p>
</div>
</div>
<div class="section" id="multi-node-cluster">
<h3>Multi-Node Cluster<a class="headerlink" href="#multi-node-cluster" title="Permalink to this headline"></a></h3>
<p>Airflow uses <a class="reference internal" href="_api/airflow/executors/sequential_executor/index.html#airflow.executors.sequential_executor.SequentialExecutor" title="airflow.executors.sequential_executor.SequentialExecutor"><code class="xref py py-class docutils literal notranslate"><span class="pre">airflow.executors.sequential_executor.SequentialExecutor</span></code></a> by default. However, by its nature, it limits you to executing at most
one task at a time. The <code class="docutils literal notranslate"><span class="pre">Sequential</span> <span class="pre">Executor</span></code> also pauses the scheduler while it runs a task, so it is not recommended for a production setup.
You should use the <a class="reference internal" href="_api/airflow/executors/local_executor/index.html#airflow.executors.local_executor.LocalExecutor" title="airflow.executors.local_executor.LocalExecutor"><code class="xref py py-class docutils literal notranslate"><span class="pre">Local</span> <span class="pre">executor</span></code></a> on a single machine.
For a multi-node setup, you should use the <a class="reference internal" href="executor/kubernetes.html"><span class="doc">Kubernetes executor</span></a> or the <a class="reference internal" href="executor/celery.html"><span class="doc">Celery executor</span></a>.</p>
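<p>The executor is set in <code class="docutils literal notranslate"><span class="pre">airflow.cfg</span></code>; for example, to switch to the Celery executor:</p>
<div class="highlight-ini notranslate"><div class="highlight"><pre><span></span>[core]
executor = CeleryExecutor
</pre></div>
</div>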
<p>Once you have configured the executor, you need to make sure that every node in the cluster contains the same configuration and DAG files.
Airflow only sends simple instructions such as “execute task X of DAG Y”; it does not ship DAG files or configuration between nodes. You can use a simple cron job or
any other mechanism to sync DAGs and configs across your nodes, e.g., check out DAGs from a Git repo every 5 minutes on all nodes, as in the sketch below.</p>
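<p>A minimal sketch of such a sync job as a cron entry on every node (the repository location and interval are just examples):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>*/5 * * * * cd /opt/airflow/dags &amp;&amp; git pull --ff-only
</pre></div>
</div>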
</div>
<div class="section" id="logging">
<h3>Logging<a class="headerlink" href="#logging" title="Permalink to this headline"></a></h3>
<p>If you are using disposable nodes in your cluster, configure the log storage to be a distributed file system (DFS) such as <code class="docutils literal notranslate"><span class="pre">S3</span></code> or <code class="docutils literal notranslate"><span class="pre">GCS</span></code>, or an external service such as
Stackdriver Logging, Elasticsearch or Amazon CloudWatch.
This way, the logs remain available even after a node goes down or gets replaced. See <a class="reference internal" href="howto/write-logs.html"><span class="doc">Writing Logs</span></a> for configurations.</p>
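<p>For example, a minimal sketch of shipping logs to S3 in <code class="docutils literal notranslate"><span class="pre">airflow.cfg</span></code> (the bucket path and connection id are placeholders):</p>
<div class="highlight-ini notranslate"><div class="highlight"><pre><span></span>[core]
remote_logging = True
remote_base_log_folder = s3://my-log-bucket/airflow/logs
remote_log_conn_id = my_s3_conn
</pre></div>
</div>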
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The logs only appear in your DFS after the task has finished. While the task is still running, you can view its logs in the UI itself.</p>
</div>
</div>
<div class="section" id="configuration">
<h3>Configuration<a class="headerlink" href="#configuration" title="Permalink to this headline"></a></h3>
<p>Airflow comes bundled with a default <code class="docutils literal notranslate"><span class="pre">airflow.cfg</span></code> configuration file.
You should use environment variables for configurations that change across deployments,
e.g., the metadata DB connection or passwords. You can do this using the format <code class="docutils literal notranslate"><span class="pre">AIRFLOW__{SECTION}__{KEY}</span></code>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">AIRFLOW__CORE__SQL_ALCHEMY_CONN</span><span class="o">=</span><span class="n">my_conn_id</span>
<span class="n">AIRFLOW__WEBSERVER__BASE_URL</span><span class="o">=</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">host</span><span class="p">:</span><span class="n">port</span>
</pre></div>
</div>
<p>Some configurations, such as the Airflow backend connection URI, can be derived from bash commands as well:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">sql_alchemy_conn_cmd</span> <span class="o">=</span> <span class="n">bash_command_to_run</span>
</pre></div>
</div>
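<p>For example, a sketch (the secrets file path is a placeholder) that reads the connection string from a file managed outside of Airflow:</p>
<div class="highlight-ini notranslate"><div class="highlight"><pre><span></span>[core]
sql_alchemy_conn_cmd = cat /run/secrets/sql_alchemy_conn
</pre></div>
</div>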
</div>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="faq.html" class="btn btn-neutral float-right" title="FAQ" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="changelog.html" class="btn btn-neutral float-left" title="Changelog" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
<div class="footer">This page uses <a href="https://analytics.google.com/">
Google Analytics</a> to collect statistics. You can disable it by blocking
the JavaScript coming from www.google-analytics.com. Check our
<a href="privacy_notice.html">Privacy Policy</a>
for more details.
</div>
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<!-- Theme Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-140539454-1', 'auto');
ga('send', 'pageview');
</script>
</body>
</html>