blob: 0128c66a350777fc7aa0d917a6a9383f348e35fd [file] [log] [blame]
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>airflow.contrib.operators.spark_jdbc_operator &mdash; Airflow Documentation</title>
<script type="text/javascript" src="../../../../../_static/js/modernizr.min.js"></script>
<script type="text/javascript" id="documentation_options" data-url_root="../../../../../" src="../../../../../_static/documentation_options.js"></script>
<script type="text/javascript" src="../../../../../_static/jquery.js"></script>
<script type="text/javascript" src="../../../../../_static/underscore.js"></script>
<script type="text/javascript" src="../../../../../_static/doctools.js"></script>
<script type="text/javascript" src="../../../../../_static/language_data.js"></script>
<script type="text/javascript" src="../../../../../_static/js/theme.js"></script>
<link rel="stylesheet" href="../../../../../_static/css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../../../../_static/pygments.css" type="text/css" />
<link rel="index" title="Index" href="../../../../../genindex.html" />
<link rel="search" title="Search" href="../../../../../search.html" />
<link rel="next" title="airflow.contrib.operators.spark_sql_operator" href="../spark_sql_operator/index.html" />
<link rel="prev" title="airflow.contrib.operators.sns_publish_operator" href="../sns_publish_operator/index.html" />
<script>
document.addEventListener('DOMContentLoaded', function() {
var el = document.getElementById('changelog');
if (el !== null ) {
// [AIRFLOW-...]
el.innerHTML = el.innerHTML.replace(
/\[(AIRFLOW-[\d]+)\]/g,
`<a href="https://issues.apache.org/jira/browse/$1">[$1]</a>`
);
// (#...)
el.innerHTML = el.innerHTML.replace(
/\(#([\d]+)\)/g,
`<a href="https://github.com/apache/airflow/pull/$1">(#$1)</a>`
);
};
})
</script>
<style>
.example-header {
position: relative;
background: #9AAA7A;
padding: 8px 16px;
margin-bottom: 0;
}
.example-header--with-button {
padding-right: 166px;
}
.example-header:after{
content: '';
display: table;
clear: both;
}
.example-title {
display:block;
padding: 4px;
margin-right: 16px;
color: white;
overflow-x: auto;
}
.example-header-button {
top: 8px;
right: 16px;
position: absolute;
}
.example-header + .highlight-python {
margin-top: 0 !important;
}
.viewcode-button {
display: inline-block;
padding: 8px 16px;
border: 0;
margin: 0;
outline: 0;
border-radius: 2px;
-webkit-box-shadow: 0 3px 5px 0 rgba(0,0,0,.3);
box-shadow: 0 3px 6px 0 rgba(0,0,0,.3);
color: #404040;
background-color: #e7e7e7;
cursor: pointer;
font-size: 16px;
font-weight: 500;
line-height: 1;
text-decoration: none;
text-overflow: ellipsis;
overflow: hidden;
text-transform: uppercase;
-webkit-transition: background-color .2s;
transition: background-color .2s;
vertical-align: middle;
white-space: nowrap;
}
.viewcode-button:visited {
color: #404040;
}
.viewcode-button:hover, .viewcode-button:focus {
color: #404040;
background-color: #d6d6d6;
}
</style>
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../../../../../index.html" class="icon icon-home"> Airflow
</a>
<div class="version">
1.10.4
</div>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../../../../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../../../../project.html">Project</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../license.html">License</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../start.html">Quick Start</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../tutorial.html">Tutorial</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../howto/index.html">How-to Guides</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../ui.html">UI / Screenshots</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../concepts.html">Concepts</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../profiling.html">Data Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../cli.html">Command Line Interface</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../scheduler.html">Scheduling &amp; Triggers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../plugins.html">Plugins</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../security.html">Security</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../timezone.html">Time zones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../api.html">Experimental Rest API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../integration.html">Integration</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../metrics.html">Metrics</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../kubernetes.html">Kubernetes</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../lineage.html">Lineage</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../changelog.html">Changelog</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../faq.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../../macros.html">Macros reference</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../../../../index.html">API Reference</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="../../../../index.html#operators">Operators</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../../../../index.html#baseoperator">BaseOperator</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../../../index.html#basesensoroperator">BaseSensorOperator</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="../../../../index.html#operators-packages">Operators packages</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="../../../operators/index.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.operators</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="../../../sensors/index.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.sensors</span></code></a></li>
<li class="toctree-l4 current"><a class="reference internal" href="../index.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.contrib.operators</span></code></a></li>
<li class="toctree-l4"><a class="reference internal" href="../../sensors/index.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.contrib.sensors</span></code></a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../../../index.html#hooks">Hooks</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../index.html#executors">Executors</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../index.html#models">Models</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../index.html#core-and-community-package">Core and community package</a></li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../../../../../index.html">Airflow</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../../../../../index.html">Docs</a> &raquo;</li>
<li><a href="../../../../index.html">API Reference</a> &raquo;</li>
<li><a href="../index.html"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.contrib.operators</span></code></a> &raquo;</li>
<li><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.contrib.operators.spark_jdbc_operator</span></code></li>
<li class="wy-breadcrumbs-aside">
<a href="../../../../../_sources/_api/airflow/contrib/operators/spark_jdbc_operator/index.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<div class="section" id="module-airflow.contrib.operators.spark_jdbc_operator">
<span id="airflow-contrib-operators-spark-jdbc-operator"></span><h1><a class="reference internal" href="#module-airflow.contrib.operators.spark_jdbc_operator" title="airflow.contrib.operators.spark_jdbc_operator"><code class="xref py py-mod docutils literal notranslate"><span class="pre">airflow.contrib.operators.spark_jdbc_operator</span></code></a><a class="headerlink" href="#module-airflow.contrib.operators.spark_jdbc_operator" title="Permalink to this headline"></a></h1>
<div class="section" id="module-contents">
<h2>Module Contents<a class="headerlink" href="#module-contents" title="Permalink to this headline"></a></h2>
<dl class="class">
<dt id="airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator">
<em class="property">class </em><code class="sig-prename descclassname">airflow.contrib.operators.spark_jdbc_operator.</code><code class="sig-name descname">SparkJDBCOperator</code><span class="sig-paren">(</span><em class="sig-param">spark_app_name='airflow-spark-jdbc'</em>, <em class="sig-param">spark_conn_id='spark-default'</em>, <em class="sig-param">spark_conf=None</em>, <em class="sig-param">spark_py_files=None</em>, <em class="sig-param">spark_files=None</em>, <em class="sig-param">spark_jars=None</em>, <em class="sig-param">num_executors=None</em>, <em class="sig-param">executor_cores=None</em>, <em class="sig-param">executor_memory=None</em>, <em class="sig-param">driver_memory=None</em>, <em class="sig-param">verbose=False</em>, <em class="sig-param">keytab=None</em>, <em class="sig-param">principal=None</em>, <em class="sig-param">cmd_type='spark_to_jdbc'</em>, <em class="sig-param">jdbc_table=None</em>, <em class="sig-param">jdbc_conn_id='jdbc-default'</em>, <em class="sig-param">jdbc_driver=None</em>, <em class="sig-param">metastore_table=None</em>, <em class="sig-param">jdbc_truncate=False</em>, <em class="sig-param">save_mode=None</em>, <em class="sig-param">save_format=None</em>, <em class="sig-param">batch_size=None</em>, <em class="sig-param">fetch_size=None</em>, <em class="sig-param">num_partitions=None</em>, <em class="sig-param">partition_column=None</em>, <em class="sig-param">lower_bound=None</em>, <em class="sig-param">upper_bound=None</em>, <em class="sig-param">create_table_column_types=None</em>, <em class="sig-param">*args</em>, <em class="sig-param">**kwargs</em><span class="sig-paren">)</span><a class="reference internal" href="../../../../../_modules/airflow/contrib/operators/spark_jdbc_operator.html#SparkJDBCOperator"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator" title="Permalink to this definition"></a></dt>
<dd><p>Bases: <a class="reference internal" href="../spark_submit_operator/index.html#airflow.contrib.operators.spark_submit_operator.SparkSubmitOperator" title="airflow.contrib.operators.spark_submit_operator.SparkSubmitOperator"><code class="xref py py-class docutils literal notranslate"><span class="pre">airflow.contrib.operators.spark_submit_operator.SparkSubmitOperator</span></code></a></p>
<p>This operator extends the SparkSubmitOperator specifically for performing data
transfers to/from JDBC-based databases with Apache Spark. As with the
SparkSubmitOperator, it assumes that the “spark-submit” binary is available on the
PATH.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>spark_app_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Name of the job (default airflow-spark-jdbc)</p></li>
<li><p><strong>spark_conn_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Connection id as configured in Airflow administration</p></li>
<li><p><strong>spark_conf</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#dict" title="(in Python v3.7)"><em>dict</em></a>) – Any additional Spark configuration properties</p></li>
<li><p><strong>spark_py_files</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Additional python files used (.zip, .egg, or .py)</p></li>
<li><p><strong>spark_files</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Additional files to upload to the container running the job</p></li>
<li><p><strong>spark_jars</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Additional jars to upload and add to the driver and
executor classpath</p></li>
<li><p><strong>num_executors</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – number of executor to run. This should be set so as to manage
the number of connections made with the JDBC database</p></li>
<li><p><strong>executor_cores</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – Number of cores per executor</p></li>
<li><p><strong>executor_memory</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Memory per executor (e.g. 1000M, 2G)</p></li>
<li><p><strong>driver_memory</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Memory allocated to the driver (e.g. 1000M, 2G)</p></li>
<li><p><strong>verbose</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.7)"><em>bool</em></a>) – Whether to pass the verbose flag to spark-submit for debugging</p></li>
<li><p><strong>keytab</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Full path to the file that contains the keytab</p></li>
<li><p><strong>principal</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – The name of the kerberos principal used for keytab</p></li>
<li><p><strong>cmd_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Which way the data should flow. 2 possible values:
spark_to_jdbc: data written by spark from metastore to jdbc
jdbc_to_spark: data written by spark from jdbc to metastore</p></li>
<li><p><strong>jdbc_table</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – The name of the JDBC table</p></li>
<li><p><strong>jdbc_conn_id</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Connection id used for connection to JDBC database</p></li>
<li><p><strong>jdbc_driver</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – Name of the JDBC driver to use for the JDBC connection. This
driver (usually a jar) should be passed in the ‘jars’ parameter</p></li>
<li><p><strong>metastore_table</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – The name of the metastore table,</p></li>
<li><p><strong>jdbc_truncate</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.7)"><em>bool</em></a>) – (spark_to_jdbc only) Whether or not Spark should truncate or
drop and recreate the JDBC table. This only takes effect if
‘save_mode’ is set to Overwrite. Also, if the schema is
different, Spark cannot truncate, and will drop and recreate</p></li>
<li><p><strong>save_mode</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – The Spark save-mode to use (e.g. overwrite, append, etc.)</p></li>
<li><p><strong>save_format</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – (jdbc_to_spark-only) The Spark save-format to use (e.g. parquet)</p></li>
<li><p><strong>batch_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – (spark_to_jdbc only) The size of the batch to insert per round
trip to the JDBC database. Defaults to 1000</p></li>
<li><p><strong>fetch_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – (jdbc_to_spark only) The size of the batch to fetch per round trip
from the JDBC database. Default depends on the JDBC driver</p></li>
<li><p><strong>num_partitions</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – The maximum number of partitions that can be used by Spark
simultaneously, both for spark_to_jdbc and jdbc_to_spark
operations. This will also cap the number of JDBC connections
that can be opened</p></li>
<li><p><strong>partition_column</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.7)"><em>str</em></a>) – (jdbc_to_spark-only) A numeric column to be used to
partition the metastore table by. If specified, you must
also specify:
num_partitions, lower_bound, upper_bound</p></li>
<li><p><strong>lower_bound</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – (jdbc_to_spark-only) Lower bound of the range of the numeric
partition column to fetch. If specified, you must also specify:
num_partitions, partition_column, upper_bound</p></li>
<li><p><strong>upper_bound</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.7)"><em>int</em></a>) – (jdbc_to_spark-only) Upper bound of the range of the numeric
partition column to fetch. If specified, you must also specify:
num_partitions, partition_column, lower_bound</p></li>
<li><p><strong>create_table_column_types</strong> – (spark_to_jdbc-only) The database column data types
to use instead of the defaults, when creating the
table. Data type information should be specified in
the same format as CREATE TABLE columns syntax
(e.g: “name CHAR(64), comments VARCHAR(1024)”).
The specified types should be valid spark sql data
types.</p></li>
</ul>
</dd>
</dl>
<dl class="method">
<dt id="airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator.execute">
<code class="sig-name descname">execute</code><span class="sig-paren">(</span><em class="sig-param">self</em>, <em class="sig-param">context</em><span class="sig-paren">)</span><a class="reference internal" href="../../../../../_modules/airflow/contrib/operators/spark_jdbc_operator.html#SparkJDBCOperator.execute"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator.execute" title="Permalink to this definition"></a></dt>
<dd><p>Call the SparkSubmitHook to run the provided spark job</p>
</dd></dl>
<dl class="method">
<dt id="airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator.on_kill">
<code class="sig-name descname">on_kill</code><span class="sig-paren">(</span><em class="sig-param">self</em><span class="sig-paren">)</span><a class="reference internal" href="../../../../../_modules/airflow/contrib/operators/spark_jdbc_operator.html#SparkJDBCOperator.on_kill"><span class="viewcode-link">[source]</span></a><a class="headerlink" href="#airflow.contrib.operators.spark_jdbc_operator.SparkJDBCOperator.on_kill" title="Permalink to this definition"></a></dt>
<dd></dd></dl>
</dd></dl>
</div>
</div>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../spark_sql_operator/index.html" class="btn btn-neutral float-right" title="airflow.contrib.operators.spark_sql_operator" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
<a href="../sns_publish_operator/index.html" class="btn btn-neutral float-left" title="airflow.contrib.operators.sns_publish_operator" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<p>
</p>
</div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>