<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Apache Software Foundation">
<link rel="shortcut icon" href="../../img/favicon.ico">
<title>Hive Avro-To-Orc Converter - Apache Gobblin</title>
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
<link rel="stylesheet" href="../../css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" />
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
<link href="../../css/extra.css" rel="stylesheet">
<script>
// Current page data
var mkdocs_page_name = "Hive Avro-To-Orc Converter";
var mkdocs_page_input_path = "adaptors/Hive-Avro-To-ORC-Converter.md";
var mkdocs_page_url = null;
</script>
<script src="../../js/jquery-2.1.1.min.js" defer></script>
<script src="../../js/modernizr-2.8.3.min.js" defer></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
</head>
<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-nav-search">
<a href="../.." class="icon icon-home"> Apache Gobblin</a>
<div role="search">
<form id ="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" title="Type search term here" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1">
<a class="" href="/">Home</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Powered-By/">Companies Powered By Gobblin</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Getting-Started/">Getting Started</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Gobblin-Architecture/">Architecture</a>
</li>
<li class="toctree-l1">
<span class="caption-text">User Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../../user-guide/Working-with-Job-Configuration-Files/">Job Configuration Files</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Deployment/">Deployment</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-as-a-Library/">Gobblin as a Library</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-CLI/">Gobblin CLI</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Compliance/">Gobblin Compliance</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-on-Yarn/">Gobblin on Yarn</a>
</li>
<li class="">
<a class="" href="../../user-guide/Compaction/">Compaction</a>
</li>
<li class="">
<a class="" href="../../user-guide/State-Management-and-Watermarks/">State Management and Watermarks</a>
</li>
<li class="">
<a class="" href="../../user-guide/Working-with-the-ForkOperator/">Fork Operator</a>
</li>
<li class="">
<a class="" href="../../user-guide/Configuration-Properties-Glossary/">Configuration Glossary</a>
</li>
<li class="">
<a class="" href="../../user-guide/Source-schema-and-Converters/">Source schema and Converters</a>
</li>
<li class="">
<a class="" href="../../user-guide/Partitioned-Writers/">Partitioned Writers</a>
</li>
<li class="">
<a class="" href="../../user-guide/Monitoring/">Monitoring</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-template/">Template</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-Schedulers/">Schedulers</a>
</li>
<li class="">
<a class="" href="../../user-guide/Job-Execution-History-Store/">Job Execution History Store</a>
</li>
<li class="">
<a class="" href="../../user-guide/Building-Gobblin/">Building Gobblin</a>
</li>
<li class="">
<a class="" href="../../user-guide/Gobblin-genericLoad/">Generic Configuration Loading</a>
</li>
<li class="">
<a class="" href="../../user-guide/Hive-Registration/">Hive Registration</a>
</li>
<li class="">
<a class="" href="../../user-guide/Config-Management/">Config Management</a>
</li>
<li class="">
<a class="" href="../../user-guide/Docker-Integration/">Docker Integration</a>
</li>
<li class="">
<a class="" href="../../user-guide/Troubleshooting/">Troubleshooting</a>
</li>
<li class="">
<a class="" href="../../user-guide/FAQs/">FAQs</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sources</span>
<ul class="subnav">
<li class="">
<a class="" href="../../sources/AvroFileSource/">Avro files</a>
</li>
<li class="">
<a class="" href="../../sources/CopySource/">File copy</a>
</li>
<li class="">
<a class="" href="../../sources/QueryBasedSource/">Query based</a>
</li>
<li class="">
<a class="" href="../../sources/RestApiSource/">Rest Api</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleAnalyticsSource/">Google Analytics</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleDriveSource/">Google Drive</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleWebmaster/">Google Webmaster</a>
</li>
<li class="">
<a class="" href="../../sources/HadoopTextInputSource/">Hadoop Text Input</a>
</li>
<li class="">
<a class="" href="../../sources/HelloWorldSource/">Hello World</a>
</li>
<li class="">
<a class="" href="../../sources/HiveAvroToOrcSource/">Hive Avro-to-ORC</a>
</li>
<li class="">
<a class="" href="../../sources/HivePurgerSource/">Hive compliance purging</a>
</li>
<li class="">
<a class="" href="../../sources/SimpleJsonSource/">JSON</a>
</li>
<li class="">
<a class="" href="../../sources/KafkaSource/">Kafka</a>
</li>
<li class="">
<a class="" href="../../sources/MySQLSource/">MySQL</a>
</li>
<li class="">
<a class="" href="../../sources/OracleSource/">Oracle</a>
</li>
<li class="">
<a class="" href="../../sources/SalesforceSource/">Salesforce</a>
</li>
<li class="">
<a class="" href="../../sources/SftpSource/">SFTP</a>
</li>
<li class="">
<a class="" href="../../sources/SqlServerSource/">SQL Server</a>
</li>
<li class="">
<a class="" href="../../sources/TeradataSource/">Teradata</a>
</li>
<li class="">
<a class="" href="../../sources/WikipediaSource/">Wikipedia</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sinks (Writers)</span>
<ul class="subnav">
<li class="">
<a class="" href="../../sinks/AvroHdfsDataWriter/">Avro HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/ParquetHdfsDataWriter/">Parquet HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/SimpleBytesWriter/">HDFS Byte array</a>
</li>
<li class="">
<a class="" href="../../sinks/ConsoleWriter/">Console</a>
</li>
<li class="">
<a class="" href="../../sinks/CouchbaseWriter/">Couchbase</a>
</li>
<li class="">
<a class="" href="../../sinks/Http/">HTTP</a>
</li>
<li class="">
<a class="" href="../../sinks/Gobblin-JDBC-Writer/">JDBC</a>
</li>
<li class="">
<a class="" href="../../sinks/Kafka/">Kafka</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Adaptors</span>
<ul class="subnav">
<li class="">
<a class="" href="../Gobblin-Distcp/">Gobblin Distcp</a>
</li>
<li class=" current">
<a class="current" href="./">Hive Avro-To-Orc Converter</a>
<ul class="subnav">
<li class="toctree-l3"><a href="#table-of-contents">Table of Contents</a></li>
<li class="toctree-l3"><a href="#getting-started">Getting Started</a></li>
<li class="toctree-l3"><a href="#job-constructs">Job Constructs</a></li>
<ul>
<li><a class="toctree-l4" href="#source-and-extractor">Source and Extractor</a></li>
<li><a class="toctree-l4" href="#converter">Converter</a></li>
<li><a class="toctree-l4" href="#writer">Writer</a></li>
<li><a class="toctree-l4" href="#publisher">Publisher</a></li>
</ul>
<li class="toctree-l3"><a href="#job-config-properties">Job Config Properties</a></li>
<li class="toctree-l3"><a href="#metrics-and-events">Metrics and Events</a></li>
<li class="toctree-l3"><a href="#sample-job">Sample Job</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Case Studies</span>
<ul class="subnav">
<li class="">
<a class="" href="../../case-studies/Kafka-HDFS-Ingestion/">Kafka-HDFS Ingestion</a>
</li>
<li class="">
<a class="" href="../../case-studies/Publishing-Data-to-S3/">Publishing Data to S3</a>
</li>
<li class="">
<a class="" href="../../case-studies/Writing-ORC-Data/">Writing ORC Data</a>
</li>
<li class="">
<a class="" href="../../case-studies/Hive-Distcp/">Hive Distcp</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Data Management</span>
<ul class="subnav">
<li class="">
<a class="" href="../../data-management/Gobblin-Retention/">Retention</a>
</li>
<li class="">
<a class="" href="../../data-management/DistcpNgEvents/">Distcp-NG events</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Metrics</span>
<ul class="subnav">
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics/">Quick Start</a>
</li>
<li class="">
<a class="" href="../../metrics/Existing-Reporters/">Existing Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Metrics-for-Gobblin-ETL/">Metrics for Gobblin ETL</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Architecture/">Gobblin Metrics Architecture</a>
</li>
<li class="">
<a class="" href="../../metrics/Implementing-New-Reporters/">Implementing New Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Performance/">Gobblin Metrics Performance</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Developer Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../../developer-guide/Customization-for-New-Source/">Customization for New Source</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Customization-for-Converter-and-Operator/">Customization for Converter and Operator</a>
</li>
<li class="">
<a class="" href="../../developer-guide/CodingStyle/">Code Style Guide</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Gobblin-Compliance-Design/">Gobblin Compliance Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/IDE-setup/">IDE setup</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Monitoring-Design/">Monitoring Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Documentation-Architecture/">Documentation Architecture</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Contributing/">Contributing</a>
</li>
<li class="">
<a class="" href="../../developer-guide/GobblinModules/">Gobblin Modules</a>
</li>
<li class="">
<a class="" href="../../developer-guide/HighLevelConsumer/">High Level Consumer</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Project</span>
<ul class="subnav">
<li class="">
<a class="" href="../../project/Feature-List/">Feature List</a>
</li>
<li class="">
<a class="" href="/people">Contributors and Team</a>
</li>
<li class="">
<a class="" href="../../project/Talks-and-Tech-Blogs/">Talks and Tech Blog Posts</a>
</li>
<li class="">
<a class="" href="../../project/Posts/">Posts</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Miscellaneous</span>
<ul class="subnav">
<li class="">
<a class="" href="../../miscellaneous/Camus-to-Gobblin-Migration/">Camus to Gobblin Migration</a>
</li>
<li class="">
<a class="" href="../../miscellaneous/Exactly-Once-Support/">Exactly Once Support</a>
</li>
</ul>
</li>
</ul>
</div>
&nbsp;
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../..">Apache Gobblin</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../..">Docs</a> &raquo;</li>
<li>Gobblin Adaptors &raquo;</li>
<li>Hive Avro-To-Orc Converter</li>
<li class="wy-breadcrumbs-aside">
<a href="https://github.com/apache/incubator-gobblin/edit/master/docs/adaptors/Hive-Avro-To-ORC-Converter.md" rel="nofollow"> Edit on Gobblin</a>
</li>
</ul>
<hr/>
</div>
<div role="main">
<div class="section">
<h2 id="table-of-contents">Table of Contents</h2>
<div class="toc">
<ul>
<li><a href="#table-of-contents">Table of Contents</a></li>
<li><a href="#getting-started">Getting Started</a></li>
<li><a href="#job-constructs">Job Constructs</a><ul>
<li><a href="#source-and-extractor">Source and Extractor</a></li>
<li><a href="#converter">Converter</a></li>
<li><a href="#writer">Writer</a></li>
<li><a href="#publisher">Publisher</a></li>
</ul>
</li>
<li><a href="#job-config-properties">Job Config Properties</a></li>
<li><a href="#metrics-and-events">Metrics and Events</a></li>
<li><a href="#sample-job">Sample Job</a></li>
</ul>
</div>
<h1 id="getting-started">Getting Started</h1>
<p>Gobblin provides ready-to-use adaptors for converting data from <a href="http://avro.apache.org/">Avro</a> to <a href="https://orc.apache.org/">ORC</a>. This page describes the steps to set up such a job.</p>
<p><b>Note: The job requires Avro data to be registered in Hive.</b></p>
<ul>
<li>The Gobblin Avro to ORC job leverages <a href="http://hive.apache.org/">Hive</a> for the conversion. That is, Gobblin does not read the Avro data record by record and convert each record to ORC; instead, it executes Hive queries to perform the conversion. This means that the Avro data MUST be registered in Hive for the conversion to be possible. Below is a sample query.</li>
</ul>
<p><b>Example Conversion DML</b></p>
<pre><code>INSERT OVERWRITE TABLE db_name_orc.table_orc
PARTITION (year='2016')
SELECT
header.id,
header.time,
... (more columns to select)
...
...
FROM db_name_avro.table_avro WHERE year='2016';
</code></pre>
<ul>
<li>Since Hive takes care of scaling the number of mappers/reducers required to perform the conversion, Gobblin does not run this job in MR mode. It runs in standalone mode.</li>
<li>Each workunit converts a Hive partition, or an entire Hive table for non-partitioned tables.</li>
<li>Each workunit/task executes one or more Hive DDLs.</li>
<li>A Gobblin task publishes data to a staging table first; the publisher then moves the data into the final table.</li>
<li>The job supports schema evolution, meaning any compatible schema changes on the Avro table are automatically applied to the ORC table.</li>
<li>By default, publishing happens per dataset (a dataset is a table in this context). If a dataset fails, the other datasets are still published but the job fails. The commit policy is configurable.</li>
<li>Gobblin Metrics is used to emit events when ORC data is published or when publishing fails.</li>
</ul>
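<p>As a concrete illustration of the standalone-mode point above, the following properties (taken from the config table and the sample job later on this page; the values here are only illustrative) select the local launcher and cap the number of concurrent conversion tasks:</p>
<pre><code># Run the job in standalone mode; Hive itself scales the mappers/reducers for each query
launcher.type=LOCAL
# Number of parallel conversion tasks, and hence parallel Hive queries / metastore connections
taskexecutor.threadpool.size=50
</code></pre>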
<h1 id="job-constructs">Job Constructs</h1>
<h2 id="source-and-extractor">Source and Extractor</h2>
<p>Gobblin provides <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/source/HiveSource.java" rel="nofollow"><code>HiveSource</code></a>, a generic source that connects to the Hive metastore and creates <code>WorkUnits</code> for any whitelisted Hive <code>Partitions</code> and <code>Tables</code>. The <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/extractor/HiveConvertExtractor.java" rel="nofollow"><code>HiveConvertExtractor</code></a> is a Gobblin <code>Extractor</code> that extracts work for Avro to ORC conversion.</p>
<p>The <code>HiveSource</code> uses the <code>HiveDatasetFinder</code> to find all Hive tables and partitions that satisfy a whitelist. For each table/partition, it creates a workunit if the <code>updateTime</code> is greater than the <code>lowWatermark</code>. By default a <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/watermarker/PartitionLevelWatermarker.java" rel="nofollow"><code>PartitionLevelWatermarker</code></a> is used. This watermarker tracks watermarks for every partition of the table. Gobblin also provides a <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/watermarker/TableLevelWatermarker.java" rel="nofollow"><code>TableLevelWatermarker</code></a> that keeps one watermark per table.</p>
<p>The <code>HiveConvertExtractor</code> builds <code>QueryBasedHiveConversionEntity</code>s. The extractor makes necessary calls to the Hive Metastore to get table/partition metadata. The metadata is then wrapped into a <code>QueryBasedHiveConversionEntity</code>.</p>
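<p>For example, a minimal source-side configuration might look like the following. The property names are the ones documented in the config table below; the values are only illustrative:</p>
<pre><code># Source that scans the Hive metastore and creates workunits
source.class=org.apache.gobblin.data.management.conversion.hive.source.HiveAvroToOrcSource
# Avro tables and partitions to consider
hive.dataset.whitelist=db1.table1|table2
# Track a watermark per partition (the default) or per table
hive.source.watermarker.class=gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker
# Ignore partitions older than 3 days
hive.source.maximum.lookbackDays=3
</code></pre>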
<h2 id="converter">Converter</h2>
<p>The converter builds the Hive DDLs/DMLs required to perform the Avro to ORC conversion. Gobblin supports conversion of Avro to both flattened ORC and nested ORC.
The abstract converter <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/converter/AbstractAvroToOrcConverter.java" rel="nofollow"><code>AbstractAvroToOrcConverter</code></a> builds DDLs/DMLs for any destination ORC format. The concrete subclass <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/converter/HiveAvroToFlattenedOrcConverter.java" rel="nofollow"><code>HiveAvroToFlattenedOrcConverter</code></a> provides the configurations required for Avro to flattened ORC conversion, and <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/converter/HiveAvroToNestedOrcConverter.java" rel="nofollow"><code>HiveAvroToNestedOrcConverter</code></a> provides the configurations required for Avro to nested ORC conversion. In the job configuration, both converters can be chained to perform flattened and nested ORC conversion in the same job. Each converter can also be used independently of the other.</p>
<p>The converter builds the following DDLs/DMLs (a simplified sketch of the generated statements follows this list):</p>
<ul>
<li>Create staging table DDL - ORC data is written to a staging table first; the publisher then publishes it to the final ORC table. This DDL creates the staging table, which is named <code>&lt;orc_db_name&gt;.&lt;orc_table_name&gt;_staging_&lt;timestamp&gt;</code></li>
<li>Create staging partition DDL - Similar to staging table but for a partition</li>
<li>Conversion staging DML - This is the DML that selects rows from the Avro source table and inserts them into the ORC staging table</li>
<li>Create final table DDL (Optional) - This is the final ORC destination table. Creates the destination table if it does not exist</li>
<li>Evolve final table DDLs (Optional) - Populate the schema evolution queries if required</li>
<li>Drop partitions if they exist in final table - DDL to drop a partition on the destination if it already exists.</li>
<li>Create final partition DDL - Create the ORC partition</li>
<li>Drop staging table DDL - Cleanup the staging table after data is published from staging to final tables</li>
</ul>
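<p>A simplified sketch of what this sequence might look like for one partition is shown below. The database, table, and partition names are hypothetical, and the real generated statements contain full column lists, storage format clauses, locations, etc.:</p>
<pre><code>-- Create staging table DDL: staging table named &lt;orc_db_name&gt;.&lt;orc_table_name&gt;_staging_&lt;timestamp&gt;
CREATE TABLE IF NOT EXISTS db_name_orc.table_orc_staging_1470227452382 (... column list ...) STORED AS ORC;
-- Conversion staging DML: select rows from the Avro source table into the ORC staging table
INSERT OVERWRITE TABLE db_name_orc.table_orc_staging_1470227452382
SELECT ... FROM db_name_avro.table_avro WHERE year='2016';
-- Drop the partition on the destination if it already exists, then create the final ORC partition
ALTER TABLE db_name_orc.table_orc DROP IF EXISTS PARTITION (year='2016');
ALTER TABLE db_name_orc.table_orc ADD PARTITION (year='2016');
-- Drop staging table DDL: clean up after the data has been published from staging to final
DROP TABLE IF EXISTS db_name_orc.table_orc_staging_1470227452382;
</code></pre>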
<h2 id="writer">Writer</h2>
<p>The writer in this context executes the Hive DDLs/DMLs generated by the converter. <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/writer/HiveQueryExecutionWriter.java" rel="nofollow"><code>HiveQueryExecutionWriter</code></a> uses the Hive JDBC connector to execute the DDLs, which write ORC data into staging tables. After the writer has completed <code>HiveQueryExecutionWriter#write()</code>, ORC data is available in the staging tables.</p>
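<p>In job configuration terms, the writer is wired in through the writer builder class and a Hive JDBC connection string, both of which also appear in the sample job at the end of this page:</p>
<pre><code># Builder for the Hive query execution writer
writer.builder.class=org.apache.gobblin.data.management.conversion.hive.writer.HiveQueryWriterBuilder
# No host and port required; an embedded hiveserver2 is started
hiveserver.connection.string=jdbc:hive2://
</code></pre>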
<h2 id="publisher">Publisher</h2>
<p>The publisher <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-data-management/src/main/java/org/apache/gobblin/data/management/conversion/hive/publisher/HiveConvertPublisher.java" rel="nofollow"><code>HiveConvertPublisher</code></a> executes Hive DDLs to publish staging ORC tables to final ORC tables. The publisher also cleans up the staging tables.
By default, publishing happens per dataset (a dataset is a table in this context). If a dataset fails, the other datasets are still published but the job fails. The commit policy is configurable.</p>
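<p>The publisher is selected through the standard Gobblin publisher property, as in the sample job below. The commit policy key mentioned in the comment is an assumption based on Gobblin's standard job-level commit configuration, not something specific to this adaptor:</p>
<pre><code># Publisher that moves ORC data from staging to final tables and cleans up staging
data.publisher.type=org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher
# (Assumed) the per-dataset commit behavior can be tuned with Gobblin's standard job.commit.policy property
</code></pre>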
<h1 id="job-config-properties">Job Config Properties</h1>
<p>These are some of the job config properties used by <code>HiveAvroToOrcSource</code> and <code>HiveConvertExtractor</code>.</p>
<table style="table-layout: fixed; width: 100%">
<col width="20%">
<col width="40%">
<col width="40%">
<tr>
<th>Configuration key</th>
<th>Description</th>
<th>Example value</th>
</tr>
<tr>
<td style="word-wrap: break-word">
<p>hive.dataset.whitelist</p>
</td>
<td style="word-wrap: break-word">Avro hive databases, tables to be converted</td>
<td style="word-wrap: break-word">
<ol>
<li >db1 -&gt; any table under db1 passes.</li>
<li >db1.table1 -&gt; only db1.table1 passes.</li>
<li>db1.table* -&gt; any table under db1 whose name satisfies the pattern table* passes.</li>
<li>db* -&gt; all tables from all databases whose names satisfy the pattern db* pass.</li>
<li>db*.table* -&gt; db and table must satisfy the patterns db* and table* respectively</li>
<li>db1.table1,db2.table2 -&gt; combine expressions for different databases with comma.</li>
<li>db1.table1|table2 -&gt; combine expressions for same database with &quot;|&quot;.</li>
</ol>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">hive.dataset.blacklist</td>
<td style="word-wrap: break-word">Avro hive databases, tables not to converted</td>
<td style="word-wrap: break-word">
<p >Same as hive.dataset.whitelist examples</p>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
<p >gobblin.runtime.root.dir</p>
</td>
<td style="word-wrap: break-word">
<p>Root dir for the Gobblin state store, staging, output, etc.</p>
</td>
<td style="word-wrap: break-word">
<p >/jobs/user/avroToOrc</p>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
<p >hive.source.maximum.lookbackDays</p>
</td>
<td style="word-wrap: break-word">
<p>Partitions older than this many days will not be processed. The default value is 3.</p>
<p>So if an Avro partition older than 3 days is modified, the job will not convert the new changes.</p>
</td>
<td style="word-wrap: break-word">3</td>
</tr>
<tr>
<td style="word-wrap: break-word">
<p >hive.source.watermarker.class</p>
</td>
<td style="word-wrap: break-word">
The type of watermark to use. The watermark can be kept per partition or per table. The default is
<p>gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker</p>
</td>
<td style="word-wrap: break-word">
<p>gobblin.data.management.conversion.hive.watermarker.PartitionLevelWatermarker</p>
<p>gobblin.data.management.conversion.hive.watermarker.TableLevelWatermarker</p>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
<p >taskexecutor.threadpool.size</p>
</td>
<td style="word-wrap: break-word">
<p>Maximum number of parallel conversion Hive queries to run. This is the standard Gobblin property that controls the number of parallel tasks (threads).</p>
<p>It is set to a default of 50 because each task queries the Hive metastore, so this property also limits the number of parallel metastore connections.</p>
</td>
<td style="word-wrap: break-word">50</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.destination.dbName
</td>
<td style="word-wrap: break-word">Name of the ORC database</td>
<td style="word-wrap: break-word">
<p>$DB is the Avro database name.</p>
<p>E.g. if the Avro database name is tracking, $DB resolves at runtime to tracking.</p>
<ul>
<li>Setting the value to &quot;$DB_column&quot; results in an ORC database name of tracking_column</li>
</ul>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.destination.tableName
</td>
<td style="word-wrap: break-word"> Name of the ORC table</td>
<td style="word-wrap: break-word">
<p>$TABLE is the Avro table name.</p>
<p>E.g. if the Avro table name is LogEvent, $TABLE resolves at runtime to LogEvent.</p>
<ul>
<li>Setting the value of this property to &quot;$TABLE&quot; causes the ORC table name to be the same as the Avro table name.</li>
<li>Setting the value to &quot;$TABLE_orc&quot; results in an ORC table name of LogEvent_orc.</li>
</ul>
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.destination.dataPath
</td>
<td style="word-wrap: break-word">Location on HDFS where ORC data is published</td>
<td style="word-wrap: break-word">/events_orc/$DB/$TABLE</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.evolution.enabled
</td>
<td style="word-wrap: break-word">Decides if schema evolution is enabled</td>
<td style="word-wrap: break-word">true/false</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.hiveRuntime.*
</td>
<td style="word-wrap: break-word">
<p>Additional Hive properties to be set while executing the conversion queries.</p>
<p>Prefix any standard Hive property with this key</p>
</td>
<td style="word-wrap: break-word">
hive.conversion.avro.flattenedOrc.hiveRuntime.mapred.map.tasks=10
</td>
</tr>
<tr>
<td style="word-wrap: break-word">
hive.conversion.avro.destinationFormats
</td>
<td style="word-wrap: break-word">A comma separated list of destination formats. Currently supports nestedOrc and flattenedOrc</td>
<td style="word-wrap: break-word">flattenedOrc,nestedOrc</td>
</tr>
</table>
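<p>Putting several of these properties together, a destination configuration for both supported formats might look like the following. The flattenedOrc keys are the ones documented above; the nestedOrc keys are an assumption based on the same hive.conversion.avro.&lt;format&gt;.* naming pattern:</p>
<pre><code>hive.conversion.avro.destinationFormats=flattenedOrc,nestedOrc
# Flattened ORC destination (keys documented in the table above)
hive.conversion.avro.flattenedOrc.destination.dbName=$DB
hive.conversion.avro.flattenedOrc.destination.tableName=$TABLE_orc
hive.conversion.avro.flattenedOrc.destination.dataPath=/events_orc/$DB/$TABLE
hive.conversion.avro.flattenedOrc.evolution.enabled=true
# Extra Hive settings applied while running the conversion queries
hive.conversion.avro.flattenedOrc.hiveRuntime.mapred.map.tasks=10
# Nested ORC destination (assumed to follow the same key naming pattern)
hive.conversion.avro.nestedOrc.destination.dbName=$DB
hive.conversion.avro.nestedOrc.destination.tableName=$TABLE_nestedOrc
hive.conversion.avro.nestedOrc.destination.dataPath=/events_nested_orc/$DB/$TABLE
</code></pre>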
<h1 id="metrics-and-events">Metrics and Events</h1>
<p>An SLA event is published every time an Avro partition or table is converted to ORC. Each SLA event has the following metadata.</p>
<pre><code>{
## Publish timestamp
&quot;timestamp&quot; : &quot;1470229945441&quot;,
&quot;namespace&quot; : &quot;gobblin.hive.conversion&quot;,
&quot;name&quot; : &quot;gobblin.hive.conversion.ConversionSuccessful&quot;,
&quot;metadata&quot; : {
## Azkaban metadata (If running on Azkaban)
&quot;azkabanExecId&quot;: &quot;880060&quot;,
&quot;azkabanFlowId&quot;: &quot;azkaban_flow_name&quot;,
&quot;azkabanJobId&quot;: &quot;azkaban_job_name&quot;,
&quot;azkabanProjectName&quot;: &quot;azkaban_project_name&quot;,
&quot;jobId&quot;: &quot;job_AvroToOrcConversion_1470227416023&quot;,
&quot;jobName&quot;: &quot;AvroToOrcConversion&quot;,
## Dataset and Partition metadata
&quot;datasetUrn&quot;: &quot;events@logevent&quot;,
&quot;sourceDataLocation&quot;: &quot;hdfs://&lt;host&gt;:&lt;port&gt;/events/LogEvent/2016/08/03/04&quot;,
&quot;partition&quot;: &quot;datepartition=2016-08-03-04&quot;,
&quot;schemaEvolutionDDLNum&quot;: &quot;0&quot;,
## Begin and End time metadata for each phase
&quot;beginConversionDDLExecuteTime&quot;: &quot;1470227453370&quot;,
&quot;beginDDLBuildTime&quot;: &quot;1470227452382&quot;,
&quot;beginGetWorkunitsTime&quot;: &quot;1470227428136&quot;,
&quot;beginPublishDDLExecuteTime&quot;: &quot;1470229944141&quot;,
&quot;endConversionDDLExecuteTime&quot;: &quot;1470227928486&quot;,
&quot;endDDLBuildTime&quot;: &quot;1470227452382&quot;,
&quot;endPublishDDLExecuteTime&quot;: &quot;1470229945440&quot;,
&quot;originTimestamp&quot;: &quot;1470227446703&quot;,
&quot;previousPublishTs&quot;: &quot;1470223843230&quot;,
&quot;upstreamTimestamp&quot;: &quot;1470226593984&quot;,
&quot;workunitCreateTime&quot;: &quot;1470227446703&quot;,
## Gobblin metrics metadata
&quot;class&quot;: &quot;org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher&quot;,
&quot;metricContextID&quot;: &quot;20bfb2a2-0592-4f53-9259-c8ee125f90a8&quot;,
&quot;metricContextName&quot;: &quot;org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher.781426901&quot;,
}
}
</code></pre>
<p>The diagram below describes the timestamps captured in the SLA event.
<img alt="Event metadata description" src="../../img/Avro-to-Orc-timeline.jpg" /></p>
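<p>For example, in the event above the conversion DML execution took <code>endConversionDDLExecuteTime - beginConversionDDLExecuteTime</code> = 1470227928486 - 1470227453370 = 475116 ms (about 8 minutes), while executing the publish DDLs took 1470229945440 - 1470229944141 = 1299 ms.</p>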
<h1 id="sample-job">Sample Job</h1>
<pre><code># Avro hive databases and tables to convert
hive.dataset.whitelist=events.LogEvent|LoginEvent
data.publisher.type=org.apache.gobblin.data.management.conversion.hive.publisher.HiveConvertPublisher
source.class=org.apache.gobblin.data.management.conversion.hive.source.HiveAvroToOrcSource
writer.builder.class=org.apache.gobblin.data.management.conversion.hive.writer.HiveQueryWriterBuilder
converter.classes=org.apache.gobblin.data.management.conversion.hive.converter.HiveAvroToFlattenedOrcConverter,org.apache.gobblin.data.management.conversion.hive.converter.HiveAvroToNestedOrcConverter
hive.dataset.finder.class=org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetFinder
# Only flattened ORC is enabled
hive.conversion.avro.destinationFormats=flattenedOrc
hive.conversion.avro.flattenedOrc.destination.dataPath=/events_orc/
# Avro table name suffixed with _orc
hive.conversion.avro.flattenedOrc.destination.tableName=$TABLE_orc
# Same as Avro table name
hive.conversion.avro.flattenedOrc.destination.dbName=$DB
hive.conversion.avro.flattenedOrc.evolution.enabled=true
hive.conversion.avro.flattenedOrc.source.dataPathIdentifier=daily,hourly
# No host and port required. Hive starts an embedded hiveserver2
hiveserver.connection.string=jdbc:hive2://
## Maximum lookback
hive.source.maximum.lookbackDays=3
## Gobblin standard properties ##
task.maxretries=1
taskexecutor.threadpool.size=75
workunit.retry.enabled=true
# Gobblin framework locations
mr.job.root.dir=/jobs/working
state.store.dir=/jobs/state_store
writer.staging.dir=/jobs/writer_staging
writer.output.dir=/jobs/writer_output
# Metrics
metrics.enabled=true
metrics.reporting.kafka.enabled=true
metrics.reporting.kafka.format=avro
metrics.reporting.kafka.avro.use.schema.registry=true
metrics.reporting.kafka.topic.metrics=MetricReport
launcher.type=LOCAL
classpath=lib/*
</code></pre>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../../case-studies/Kafka-HDFS-Ingestion/" class="btn btn-neutral float-right" title="Kafka-HDFS Ingestion">Next <span class="icon icon-circle-arrow-right"></span></a>
<a href="../Gobblin-Distcp/" class="btn btn-neutral" title="Gobblin Distcp"><span class="icon icon-circle-arrow-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<!-- Copyright etc -->
</div>
Built with <a href="http://www.mkdocs.org" rel="nofollow">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme" rel="nofollow">theme</a> provided by <a href="https://readthedocs.org" rel="nofollow">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<div class="rst-versions" role="note" style="cursor: pointer">
<span class="rst-current-version" data-toggle="rst-current-version">
<span><a href="../Gobblin-Distcp/" style="color: #fcfcfc;">&laquo; Previous</a></span>
<span style="margin-left: 15px"><a href="../../case-studies/Kafka-HDFS-Ingestion/" style="color: #fcfcfc">Next &raquo;</a></span>
</span>
</div>
<script>var base_url = '../..';</script>
<script src="../../js/theme.js" defer></script>
<script src="../../js/extra.js" defer></script>
<script src="../../search/main.js" defer></script>
</body>
</html>