<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<!-- Document metadata for the "Getting Started" page of the Apache Gobblin docs. -->
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Apache Software Foundation">
<link rel="shortcut icon" href="../img/favicon.ico">
<title>Getting Started - Apache Gobblin</title>
<!-- Stylesheets: web fonts, theme, syntax-highlighting colours, then site overrides.
     Third-party URLs use an explicit https: scheme (not protocol-relative //) so they
     also load when the page is opened from file:// and are never fetched over plain HTTP. -->
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
<link rel="stylesheet" href="../css/theme.css" type="text/css" />
<link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
<link href="../css/extra.css" rel="stylesheet">
<script>
// Current page data
var mkdocs_page_name = "Getting Started";
var mkdocs_page_input_path = "Getting-Started.md";
var mkdocs_page_url = null;
</script>
<script src="../js/jquery-2.1.1.min.js" defer></script>
<script src="../js/modernizr-2.8.3.min.js" defer></script>
<!-- highlight.js is loaded synchronously (no defer) because the inline script
     directly below calls into hljs as soon as it is parsed. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
</head>
<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-nav-search">
<a href=".." class="icon icon-home"> Apache Gobblin</a>
<div role="search">
<!-- Sidebar search box: submits the query as the "q" parameter to ../search.html via GET. -->
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<!-- There is no visible <label>, so aria-label supplies the accessible name;
     a placeholder alone is not reliably announced by assistive technology. -->
<input type="text" name="q" placeholder="Search docs" title="Type search term here" aria-label="Search docs" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1">
<a class="" href="/">Home</a>
</li>
<li class="toctree-l1">
<a class="" href="../Powered-By/">Companies Powered By Gobblin</a>
</li>
<li class="toctree-l1 current">
<a class="current" href="./">Getting Started</a>
<ul class="subnav">
<li class="toctree-l2"><a href="#table-of-contents">Table of Contents</a></li>
<li class="toctree-l2"><a href="#introduction">Introduction</a></li>
<li class="toctree-l2"><a href="#getting-a-gobblin-release">Getting a Gobblin Release</a></li>
<ul>
<li><a class="toctree-l3" href="#building-a-distribution">Building a Distribution</a></li>
</ul>
<li class="toctree-l2"><a href="#run-your-first-job">Run Your First Job</a></li>
<ul>
<li><a class="toctree-l3" href="#steps">Steps</a></li>
</ul>
<li class="toctree-l2"><a href="#running-gobblin-as-a-daemon">Running Gobblin as a Daemon</a></li>
<ul>
<li><a class="toctree-l3" href="#preliminary">Preliminary</a></li>
<li><a class="toctree-l3" href="#steps_1">Steps</a></li>
</ul>
<li class="toctree-l2"><a href="#other-example-jobs">Other Example Jobs</a></li>
</ul>
</li>
<li class="toctree-l1">
<a class="" href="../Gobblin-Architecture/">Architecture</a>
</li>
<li class="toctree-l1">
<span class="caption-text">User Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../user-guide/Working-with-Job-Configuration-Files/">Job Configuration Files</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-Deployment/">Deployment</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-as-a-Library/">Gobblin as a Library</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-CLI/">Gobblin CLI</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-Compliance/">Gobblin Compliance</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-on-Yarn/">Gobblin on Yarn</a>
</li>
<li class="">
<a class="" href="../user-guide/Compaction/">Compaction</a>
</li>
<li class="">
<a class="" href="../user-guide/State-Management-and-Watermarks/">State Management and Watermarks</a>
</li>
<li class="">
<a class="" href="../user-guide/Working-with-the-ForkOperator/">Fork Operator</a>
</li>
<li class="">
<a class="" href="../user-guide/Configuration-Properties-Glossary/">Configuration Glossary</a>
</li>
<li class="">
<a class="" href="../user-guide/Source-schema-and-Converters/">Source schema and Converters</a>
</li>
<li class="">
<a class="" href="../user-guide/Partitioned-Writers/">Partitioned Writers</a>
</li>
<li class="">
<a class="" href="../user-guide/Monitoring/">Monitoring</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-template/">Template</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-Schedulers/">Schedulers</a>
</li>
<li class="">
<a class="" href="../user-guide/Job-Execution-History-Store/">Job Execution History Store</a>
</li>
<li class="">
<a class="" href="../user-guide/Building-Gobblin/">Building Gobblin</a>
</li>
<li class="">
<a class="" href="../user-guide/Gobblin-genericLoad/">Generic Configuration Loading</a>
</li>
<li class="">
<a class="" href="../user-guide/Hive-Registration/">Hive Registration</a>
</li>
<li class="">
<a class="" href="../user-guide/Config-Management/">Config Management</a>
</li>
<li class="">
<a class="" href="../user-guide/Docker-Integration/">Docker Integration</a>
</li>
<li class="">
<a class="" href="../user-guide/Troubleshooting/">Troubleshooting</a>
</li>
<li class="">
<a class="" href="../user-guide/FAQs/">FAQs</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sources</span>
<ul class="subnav">
<li class="">
<a class="" href="../sources/AvroFileSource/">Avro files</a>
</li>
<li class="">
<a class="" href="../sources/CopySource/">File copy</a>
</li>
<li class="">
<a class="" href="../sources/QueryBasedSource/">Query based</a>
</li>
<li class="">
<a class="" href="../sources/RestApiSource/">Rest Api</a>
</li>
<li class="">
<a class="" href="../sources/GoogleAnalyticsSource/">Google Analytics</a>
</li>
<li class="">
<a class="" href="../sources/GoogleDriveSource/">Google Drive</a>
</li>
<li class="">
<a class="" href="../sources/GoogleWebmaster/">Google Webmaster</a>
</li>
<li class="">
<a class="" href="../sources/HadoopTextInputSource/">Hadoop Text Input</a>
</li>
<li class="">
<a class="" href="../sources/HelloWorldSource/">Hello World</a>
</li>
<li class="">
<a class="" href="../sources/HiveAvroToOrcSource/">Hive Avro-to-ORC</a>
</li>
<li class="">
<a class="" href="../sources/HivePurgerSource/">Hive compliance purging</a>
</li>
<li class="">
<a class="" href="../sources/SimpleJsonSource/">JSON</a>
</li>
<li class="">
<a class="" href="../sources/KafkaSource/">Kafka</a>
</li>
<li class="">
<a class="" href="../sources/MySQLSource/">MySQL</a>
</li>
<li class="">
<a class="" href="../sources/OracleSource/">Oracle</a>
</li>
<li class="">
<a class="" href="../sources/SalesforceSource/">Salesforce</a>
</li>
<li class="">
<a class="" href="../sources/SftpSource/">SFTP</a>
</li>
<li class="">
<a class="" href="../sources/SqlServerSource/">SQL Server</a>
</li>
<li class="">
<a class="" href="../sources/TeradataSource/">Teradata</a>
</li>
<li class="">
<a class="" href="../sources/WikipediaSource/">Wikipedia</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sinks (Writers)</span>
<ul class="subnav">
<li class="">
<a class="" href="../sinks/AvroHdfsDataWriter/">Avro HDFS</a>
</li>
<li class="">
<a class="" href="../sinks/ParquetHdfsDataWriter/">Parquet HDFS</a>
</li>
<li class="">
<a class="" href="../sinks/SimpleBytesWriter/">HDFS Byte array</a>
</li>
<li class="">
<a class="" href="../sinks/ConsoleWriter/">Console</a>
</li>
<li class="">
<a class="" href="../sinks/CouchbaseWriter/">Couchbase</a>
</li>
<li class="">
<a class="" href="../sinks/Http/">HTTP</a>
</li>
<li class="">
<a class="" href="../sinks/Gobblin-JDBC-Writer/">JDBC</a>
</li>
<li class="">
<a class="" href="../sinks/Kafka/">Kafka</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Adaptors</span>
<ul class="subnav">
<li class="">
<a class="" href="../adaptors/Gobblin-Distcp/">Gobblin Distcp</a>
</li>
<li class="">
<a class="" href="../adaptors/Hive-Avro-To-ORC-Converter/">Hive Avro-To-Orc Converter</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Case Studies</span>
<ul class="subnav">
<li class="">
<a class="" href="../case-studies/Kafka-HDFS-Ingestion/">Kafka-HDFS Ingestion</a>
</li>
<li class="">
<a class="" href="../case-studies/Publishing-Data-to-S3/">Publishing Data to S3</a>
</li>
<li class="">
<a class="" href="../case-studies/Writing-ORC-Data/">Writing ORC Data</a>
</li>
<li class="">
<a class="" href="../case-studies/Hive-Distcp/">Hive Distcp</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Data Management</span>
<ul class="subnav">
<li class="">
<a class="" href="../data-management/Gobblin-Retention/">Retention</a>
</li>
<li class="">
<a class="" href="../data-management/DistcpNgEvents/">Distcp-NG events</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Metrics</span>
<ul class="subnav">
<li class="">
<a class="" href="../metrics/Gobblin-Metrics/">Quick Start</a>
</li>
<li class="">
<a class="" href="../metrics/Existing-Reporters/">Existing Reporters</a>
</li>
<li class="">
<a class="" href="../metrics/Metrics-for-Gobblin-ETL/">Metrics for Gobblin ETL</a>
</li>
<li class="">
<a class="" href="../metrics/Gobblin-Metrics-Architecture/">Gobblin Metrics Architecture</a>
</li>
<li class="">
<a class="" href="../metrics/Implementing-New-Reporters/">Implementing New Reporters</a>
</li>
<li class="">
<a class="" href="../metrics/Gobblin-Metrics-Performance/">Gobblin Metrics Performance</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Developer Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../developer-guide/Customization-for-New-Source/">Customization for New Source</a>
</li>
<li class="">
<a class="" href="../developer-guide/Customization-for-Converter-and-Operator/">Customization for Converter and Operator</a>
</li>
<li class="">
<a class="" href="../developer-guide/CodingStyle/">Code Style Guide</a>
</li>
<li class="">
<a class="" href="../developer-guide/Gobblin-Compliance-Design/">Gobblin Compliance Design</a>
</li>
<li class="">
<a class="" href="../developer-guide/IDE-setup/">IDE setup</a>
</li>
<li class="">
<a class="" href="../developer-guide/Monitoring-Design/">Monitoring Design</a>
</li>
<li class="">
<a class="" href="../developer-guide/Documentation-Architecture/">Documentation Architecture</a>
</li>
<li class="">
<a class="" href="../developer-guide/Contributing/">Contributing</a>
</li>
<li class="">
<a class="" href="../developer-guide/GobblinModules/">Gobblin Modules</a>
</li>
<li class="">
<a class="" href="../developer-guide/HighLevelConsumer/">High Level Consumer</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Project</span>
<ul class="subnav">
<li class="">
<a class="" href="../project/Feature-List/">Feature List</a>
</li>
<li class="">
<a class="" href="/people">Contributors and Team</a>
</li>
<li class="">
<a class="" href="../project/Talks-and-Tech-Blogs/">Talks and Tech Blog Posts</a>
</li>
<li class="">
<a class="" href="../project/Posts/">Posts</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Miscellaneous</span>
<ul class="subnav">
<li class="">
<a class="" href="../miscellaneous/Camus-to-Gobblin-Migration/">Camus to Gobblin Migration</a>
</li>
<li class="">
<a class="" href="../miscellaneous/Exactly-Once-Support/">Exactly Once Support</a>
</li>
</ul>
</li>
</ul>
</div>
&nbsp;
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="..">Apache Gobblin</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="..">Docs</a> &raquo;</li>
<li>Getting Started</li>
<li class="wy-breadcrumbs-aside">
<a href="https://github.com/apache/incubator-gobblin/edit/master/docs/Getting-Started.md" rel="nofollow"> Edit on Gobblin</a>
</li>
</ul>
<hr/>
</div>
<div role="main">
<div class="section">
<h2 id="table-of-contents">Table of Contents</h2>
<div class="toc">
<ul>
<li><a href="#table-of-contents">Table of Contents</a></li>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#getting-a-gobblin-release">Getting a Gobblin Release</a><ul>
<li><a href="#building-a-distribution">Building a Distribution</a></li>
</ul>
</li>
<li><a href="#run-your-first-job">Run Your First Job</a><ul>
<li><a href="#steps">Steps</a></li>
</ul>
</li>
<li><a href="#running-gobblin-as-a-daemon">Running Gobblin as a Daemon</a><ul>
<li><a href="#preliminary">Preliminary</a></li>
<li><a href="#steps_1">Steps</a></li>
</ul>
</li>
<li><a href="#other-example-jobs">Other Example Jobs</a></li>
</ul>
</div>
<h1 id="introduction">Introduction</h1>
<p>This guide will help you set up Gobblin and run your first job. Currently, Gobblin requires JDK 7 or later to run.</p>
<h1 id="getting-a-gobblin-release">Getting a Gobblin Release</h1>
<p>All steps on this page assume you are using an Apache Gobblin source distribution.</p>
<p>Download the source distribution from the <a href="/download/">download page</a>.</p>
<h2 id="building-a-distribution">Building a Distribution</h2>
<p>Build a distribution:</p>
<pre><code class="bash">cd /path/to/gobblin/source
./gradlew :gobblin-distribution:buildDistributionTar
</code></pre>
<p>Note: A full build takes longer because it also runs other tasks such as test, javadoc, and findMainBugs.
For quick use, building only the distribution is sufficient. If you do need a full build, run:</p>
<pre><code class="bash">./gradlew build
</code></pre>
<p>The generated distribution contains the binary in a specific directory structure, which is different from source.</p>
<p>After the build is done, there should be a tarball (if there are multiple, use the newest one) at </p>
<p><code>build/gobblin-distribution/distributions/</code></p>
<p>Distributions built from source are generated as <code>*.tar.gz</code> files. After getting the tarball, unpack it locally:</p>
<p><code>tar -xvf gobblin-distribution-[VERSION].tar.gz</code></p>
<h1 id="run-your-first-job">Run Your First Job</h1>
<p>Note: the following two sections are only applicable to newer versions of Gobblin. If you are running version 0.8.0 or earlier, skip to <a href="#running-gobblin-as-a-daemon">Gobblin daemon</a>.</p>
<p>Here we illustrate how to run a simple job. This job will pull the revisions from the last ten days for each of two Wikipedia pages: LinkedIn and Wikipedia:Sandbox (a page with frequent edits). The records will be written to stdout.</p>
<p>Gobblin can run either in standalone mode or on MapReduce. In this example we will run Gobblin in standalone mode.</p>
<p>This page explains how to run the job from the terminal. You may also run this job from your favorite IDE (IntelliJ is recommended).</p>
<h2 id="steps">Steps</h2>
<ul>
<li>cd to the unpacked Gobblin distribution and run <code>bin/gobblin cli run</code> to get usage.</li>
<li>Running <code>bin/gobblin cli run listQuickApps</code> will list the available easy-to-configure apps. Note the line with the wikipedia example:</li>
</ul>
<pre><code class="bash">wikipedia - Gobblin example that downloads revisions from Wikipedia.
</code></pre>
<ul>
<li>Running <code>bin/gobblin cli run wikipedia</code> will show the usage of this application. Notice the usage and one of the options listed for this job:</li>
</ul>
<pre><code class="bash">usage: gobblin cli run wikipedia [OPTIONS] &lt;article-title&gt; [&lt;article-title&gt;...]
-lookback &lt;arg&gt; Sets the period for which articles should be
pulled in ISO time format (e.g. P2D, PT1H)
</code></pre>
<ul>
<li>Run <code>bin/gobblin cli run wikipedia -lookback P10D LinkedIn Wikipedia:Sandbox</code>. This will print a lot of logs, but somewhere in there you will see a few json entries with the revisions for those articles. For example:</li>
</ul>
<pre><code class="bash">{&quot;revid&quot;:746260034,&quot;parentid&quot;:745444076,&quot;user&quot;:&quot;2605:8D80:580:5824:B108:82BD:693D:CFA1&quot;,&quot;anon&quot;:&quot;&quot;,&quot;userid&quot;:0,&quot;timestamp&quot;:&quot;2016-10-26T08:12:09Z&quot;,&quot;size&quot;:69527,&quot;pageid&quot;:970755,&quot;title&quot;:&quot;LinkedIn&quot;}
</code></pre>
<ul>
<li>In the usage, there is also an option to instead write the output to an avro file:</li>
</ul>
<pre><code class="bash"> -avroOutput &lt;arg&gt; Write output to Avro files. Specify the
output directory as argument.
</code></pre>
<p>Running <code>bin/gobblin cli run wikipedia -lookback P10D -avroOutput /tmp/wikiSample LinkedIn Wikipedia:Sandbox</code> will create a directory <code>/tmp/wikiSample</code> with two subdirectories <code>LinkedIn</code> and <code>Wikipedia_Sandbox</code> each one with one avro file.</p>
<h1 id="running-gobblin-as-a-daemon">Running Gobblin as a Daemon</h1>
<p>Here we show how to run a Gobblin daemon. A Gobblin daemon tracks a directory and finds job configuration files in it (files with the extension <code>*.pull</code>). Jobs can be either run-once or scheduled. Gobblin will automatically execute these jobs as they are received, following the schedule.</p>
<p>For this example, we will once again run the Wikipedia example. The records will be stored as Avro files.</p>
<h2 id="preliminary">Preliminary</h2>
<p>Each Gobblin job minimally involves several constructs, e.g. <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/Source.java" rel="nofollow">Source</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/extractor/Extractor.java" rel="nofollow">Extractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/writer/DataWriter.java" rel="nofollow">DataWriter</a> and <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/publisher/DataPublisher.java" rel="nofollow">DataPublisher</a>. As the names suggest, Source defines the source to pull data from, Extractor implements the logic to extract data records, DataWriter defines the way the extracted records are output, and DataPublisher publishes the data to the final output location. A job may optionally have one or more Converters, which transform the extracted records, as well as one or more PolicyCheckers that check the quality of the extracted records and determine whether they conform to certain policies.</p>
<p>Some of the classes relevant to this example include <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaSource.java" rel="nofollow">WikipediaSource</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaExtractor.java" rel="nofollow">WikipediaExtractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/java/org/apache/gobblin/example/wikipedia/WikipediaConverter.java" rel="nofollow">WikipediaConverter</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/writer/AvroHdfsDataWriter.java" rel="nofollow">AvroHdfsDataWriter</a> and <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/publisher/BaseDataPublisher.java" rel="nofollow">BaseDataPublisher</a>.</p>
<p>To run Gobblin in standalone daemon mode we need a Gobblin configuration file (such as <a href="https://github.com/apache/incubator-gobblin/blob/master/conf/standalone/application.conf" rel="nofollow">application.conf</a>). For each job we wish to run, we also need a job configuration file (such as <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull" rel="nofollow">wikipedia.pull</a>). The Gobblin configuration file, which is passed to Gobblin as a command line argument, should contain a property <code>jobconf.dir</code> which specifies where the job configuration files are located. By default, <code>jobconf.dir</code> points to the environment variable <code>GOBBLIN_JOB_CONFIG_DIR</code>. Each file in <code>jobconf.dir</code> with extension <code>.job</code> or <code>.pull</code> is considered a job configuration file, and Gobblin will launch a job for each such file. For more information on Gobblin deployment in standalone mode, refer to the <a href="../user-guide/Gobblin-Deployment/#Standalone-Deployment">Standalone Deployment</a> page.</p>
<p>A list of commonly used configuration properties can be found here: <a href="../user-guide/Configuration-Properties-Glossary/">Configuration Properties Glossary</a>.</p>
<h2 id="steps_1">Steps</h2>
<ul>
<li>
<p>Create a folder to store the job configuration file. Put <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull" rel="nofollow">wikipedia.pull</a> in this folder, and set environment variable <code>GOBBLIN_JOB_CONFIG_DIR</code> to point to this folder. Also, make sure that the environment variable <code>JAVA_HOME</code> is set correctly.</p>
</li>
<li>
<p>Create a folder as Gobblin's working directory. Gobblin will write job output as well as other information there, such as locks and the state store (for more information, see the <a href="../user-guide/Gobblin-Deployment/#Standalone-Deployment">Standalone Deployment</a> page). Set the environment variable <code>GOBBLIN_WORK_DIR</code> to point to that folder.</p>
</li>
<li>
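<p>Unpack the Gobblin distribution as described in <a href="#building-a-distribution">Building a Distribution</a>, and cd into the unpacked directory.</p>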
</li>
<li>
<p>Launch Gobblin in one of the execution modes (for more information, see <a href="../user-guide/Gobblin-CLI/">Gobblin CLI</a>):</p>
</li>
</ul>
<pre><code class="bash">gobblin service standalone start
</code></pre>
<p>Stdout and the job log, which contains the progress and status of the job, will be written into <code>logs/&lt;execution-mode&gt;.out</code> &amp; <code>logs/&lt;execution-mode&gt;.err</code> (to change where the log is written, modify the Log4j configuration file <code>conf/log4j.xml</code>).</p>
<p>Among the job logs there should be the following information:</p>
<pre><code>INFO JobScheduler - Loaded 1 job configuration
INFO AbstractJobLauncher - Starting job job_PullFromWikipedia_1422040355678
INFO TaskExecutor - Starting the task executor
INFO LocalTaskStateTracker2 - Starting the local task state tracker
INFO AbstractJobLauncher - Submitting task task_PullFromWikipedia_1422040355678_0 to run
INFO TaskExecutor - Submitting task task_PullFromWikipedia_1422040355678_0
INFO AbstractJobLauncher - Waiting for submitted tasks of job job_PullFromWikipedia_1422040355678 to complete... to complete...
INFO AbstractJobLauncher - 1 out of 1 tasks of job job_PullFromWikipedia_1422040355678 are running
INFO WikipediaExtractor - 5 record(s) retrieved for title NASA
INFO WikipediaExtractor - 5 record(s) retrieved for title LinkedIn
INFO WikipediaExtractor - 5 record(s) retrieved for title Parris_Cues
INFO WikipediaExtractor - 5 record(s) retrieved for title Barbara_Corcoran
INFO Task - Extracted 20 data records
INFO Fork-0 - Committing data of branch 0 of task task_PullFromWikipedia_1422040355678_0
INFO LocalTaskStateTracker2 - Task task_PullFromWikipedia_1422040355678_0 completed in 2334ms with state SUCCESSFUL
INFO AbstractJobLauncher - All tasks of job job_PullFromWikipedia_1422040355678 have completed
INFO TaskExecutor - Stopping the task executor
INFO LocalTaskStateTracker2 - Stopping the local task state tracker
INFO AbstractJobLauncher - Publishing job data of job job_PullFromWikipedia_1422040355678 with commit policy COMMIT_ON_FULL_SUCCESS
INFO AbstractJobLauncher - Persisting job/task states of job job_PullFromWikipedia_1422040355678
</code></pre>
<ul>
<li>After the job is done, stop Gobblin by running</li>
</ul>
<pre><code class="bash">gobblin service standalone stop
</code></pre>
<p>The job output is written to the <code>GOBBLIN_WORK_DIR/job-output</code> folder as an Avro file.</p>
<p>To see the content of the job output, use the Avro tools to convert Avro to JSON. Download the latest version of Avro tools (e.g. avro-tools-1.8.1.jar):</p>
<pre><code class="bash">curl -O https://repo1.maven.org/maven2/org/apache/avro/avro-tools/1.8.1/avro-tools-1.8.1.jar
</code></pre>
<p>and run</p>
<pre><code class="bash">java -jar avro-tools-1.8.1.jar tojson --pretty [job_output].avro &gt; output.json
</code></pre>
<p><code>output.json</code> will contain all retrieved records in JSON format.</p>
<p>Note that since the job configuration file we used (<a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/wikipedia.pull" rel="nofollow">wikipedia.pull</a>) doesn't specify a job schedule, the job will run immediately and will run only once. To schedule a job to run at a certain time and/or repeatedly, set the <code>job.schedule</code> property using a cron-based syntax. For example, <code>job.schedule=0 0/2 * * * ?</code> will run the job every two minutes. See the <a href="http://www.quartz-scheduler.org/documentation/quartz-2.1.x/tutorials/crontrigger.html" rel="nofollow">Quartz CronTrigger tutorial</a> for more details.</p>
<h1 id="other-example-jobs">Other Example Jobs</h1>
<p>Besides the Wikipedia example, we have another example job, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-example/src/main/resources/simplejson.pull" rel="nofollow">SimpleJson</a>, which extracts records from JSON files and stores them in Avro files.</p>
<p>To create your own jobs, simply implement the relevant interfaces such as <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/Source.java" rel="nofollow">Source</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/source/extractor/Extractor.java" rel="nofollow">Extractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/converter/Converter.java" rel="nofollow">Converter</a> and <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-api/src/main/java/org/apache/gobblin/writer/DataWriter.java" rel="nofollow">DataWriter</a>. In the job configuration file, set properties such as <code>source.class</code> and <code>converter.class</code> to point to these classes.</p>
<p>On a side note: while users are free to directly implement the Extractor interface (e.g., WikipediaExtractor), Gobblin also provides several extractor implementations based on commonly used protocols, e.g., <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-modules/gobblin-kafka-common/src/main/java/org/apache/gobblin/source/extractor/extract/kafka/KafkaExtractor.java" rel="nofollow">KafkaExtractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/restapi/RestApiExtractor.java" rel="nofollow">RestApiExtractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-modules/gobblin-sql/src/main/java/org/apache/gobblin/source/jdbc/JdbcExtractor.java" rel="nofollow">JdbcExtractor</a>, <a href="https://github.com/apache/incubator-gobblin/blob/master/gobblin-core/src/main/java/org/apache/gobblin/source/extractor/extract/sftp/SftpExtractor.java" rel="nofollow">SftpExtractor</a>, etc. Users are encouraged to extend these classes to take advantage of existing implementations.</p>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../Gobblin-Architecture/" class="btn btn-neutral float-right" title="Architecture">Next <span class="icon icon-circle-arrow-right"></span></a>
<a href="../Powered-By/" class="btn btn-neutral" title="Companies Powered By Gobblin"><span class="icon icon-circle-arrow-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<!-- Copyright etc -->
</div>
Built with <a href="http://www.mkdocs.org" rel="nofollow">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme" rel="nofollow">theme</a> provided by <a href="https://readthedocs.org" rel="nofollow">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<div class="rst-versions" role="note" style="cursor: pointer">
<span class="rst-current-version" data-toggle="rst-current-version">
<span><a href="../Powered-By/" style="color: #fcfcfc;">&laquo; Previous</a></span>
<span style="margin-left: 15px"><a href="../Gobblin-Architecture/" style="color: #fcfcfc">Next &raquo;</a></span>
</span>
</div>
<script>var base_url = '..';</script>
<script src="../js/theme.js" defer></script>
<script src="../js/extra.js" defer></script>
<script src="../search/main.js" defer></script>
</body>
</html>