blob: 9f3c06d15b83d39d9f869ed6cf2ba190eebcb901 [file] [log] [blame]
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="Apache Software Foundation">
<link rel="shortcut icon" href="../../img/favicon.ico">
<title>Gobblin Compliance - Apache Gobblin</title>
<link href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700' rel='stylesheet' type='text/css'>
<link rel="stylesheet" href="../../css/theme.css" type="text/css" />
<link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" />
<link rel="stylesheet" href="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/styles/github.min.css">
<link href="../../css/extra.css" rel="stylesheet">
<script>
// Current page data
var mkdocs_page_name = "Gobblin Compliance";
var mkdocs_page_input_path = "user-guide/Gobblin-Compliance.md";
var mkdocs_page_url = null;
</script>
<script src="../../js/jquery-2.1.1.min.js" defer></script>
<script src="../../js/modernizr-2.8.3.min.js" defer></script>
<script src="//cdnjs.cloudflare.com/ajax/libs/highlight.js/9.12.0/highlight.min.js"></script>
<script>hljs.initHighlightingOnLoad();</script>
</head>
<body class="wy-body-for-nav" role="document">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
<div class="wy-side-nav-search">
<a href="../.." class="icon icon-home"> Apache Gobblin</a>
<div role="search">
<form id ="rtd-search-form" class="wy-form" action="../../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" title="Type search term here" />
</form>
</div>
</div>
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
<ul class="current">
<li class="toctree-l1">
<a class="" href="/">Home</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Powered-By/">Companies Powered By Gobblin</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Getting-Started/">Getting Started</a>
</li>
<li class="toctree-l1">
<a class="" href="../../Gobblin-Architecture/">Architecture</a>
</li>
<li class="toctree-l1">
<span class="caption-text">User Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../Working-with-Job-Configuration-Files/">Job Configuration Files</a>
</li>
<li class="">
<a class="" href="../Gobblin-Deployment/">Deployment</a>
</li>
<li class="">
<a class="" href="../Gobblin-as-a-Library/">Gobblin as a Library</a>
</li>
<li class="">
<a class="" href="../Gobblin-CLI/">Gobblin CLI</a>
</li>
<li class=" current">
<a class="current" href="./">Gobblin Compliance</a>
<ul class="subnav">
<li class="toctree-l3"><a href="#introduction">Introduction</a></li>
<li class="toctree-l3"><a href="#usage">Usage</a></li>
<li class="toctree-l3"><a href="#configuration">Configuration</a></li>
<li class="toctree-l3"><a href="#developer-guide">Developer Guide</a></li>
</ul>
</li>
<li class="">
<a class="" href="../Gobblin-on-Yarn/">Gobblin on Yarn</a>
</li>
<li class="">
<a class="" href="../Compaction/">Compaction</a>
</li>
<li class="">
<a class="" href="../State-Management-and-Watermarks/">State Management and Watermarks</a>
</li>
<li class="">
<a class="" href="../Working-with-the-ForkOperator/">Fork Operator</a>
</li>
<li class="">
<a class="" href="../Configuration-Properties-Glossary/">Configuration Glossary</a>
</li>
<li class="">
<a class="" href="../Source-schema-and-Converters/">Source schema and Converters</a>
</li>
<li class="">
<a class="" href="../Partitioned-Writers/">Partitioned Writers</a>
</li>
<li class="">
<a class="" href="../Monitoring/">Monitoring</a>
</li>
<li class="">
<a class="" href="../Gobblin-template/">Template</a>
</li>
<li class="">
<a class="" href="../Gobblin-Schedulers/">Schedulers</a>
</li>
<li class="">
<a class="" href="../Job-Execution-History-Store/">Job Execution History Store</a>
</li>
<li class="">
<a class="" href="../Building-Gobblin/">Building Gobblin</a>
</li>
<li class="">
<a class="" href="../Gobblin-genericLoad/">Generic Configuration Loading</a>
</li>
<li class="">
<a class="" href="../Hive-Registration/">Hive Registration</a>
</li>
<li class="">
<a class="" href="../Config-Management/">Config Management</a>
</li>
<li class="">
<a class="" href="../Docker-Integration/">Docker Integration</a>
</li>
<li class="">
<a class="" href="../Troubleshooting/">Troubleshooting</a>
</li>
<li class="">
<a class="" href="../FAQs/">FAQs</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sources</span>
<ul class="subnav">
<li class="">
<a class="" href="../../sources/AvroFileSource/">Avro files</a>
</li>
<li class="">
<a class="" href="../../sources/CopySource/">File copy</a>
</li>
<li class="">
<a class="" href="../../sources/QueryBasedSource/">Query based</a>
</li>
<li class="">
<a class="" href="../../sources/RestApiSource/">Rest Api</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleAnalyticsSource/">Google Analytics</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleDriveSource/">Google Drive</a>
</li>
<li class="">
<a class="" href="../../sources/GoogleWebmaster/">Google Webmaster</a>
</li>
<li class="">
<a class="" href="../../sources/HadoopTextInputSource/">Hadoop Text Input</a>
</li>
<li class="">
<a class="" href="../../sources/HelloWorldSource/">Hello World</a>
</li>
<li class="">
<a class="" href="../../sources/HiveAvroToOrcSource/">Hive Avro-to-ORC</a>
</li>
<li class="">
<a class="" href="../../sources/HivePurgerSource/">Hive compliance purging</a>
</li>
<li class="">
<a class="" href="../../sources/SimpleJsonSource/">JSON</a>
</li>
<li class="">
<a class="" href="../../sources/KafkaSource/">Kafka</a>
</li>
<li class="">
<a class="" href="../../sources/MySQLSource/">MySQL</a>
</li>
<li class="">
<a class="" href="../../sources/OracleSource/">Oracle</a>
</li>
<li class="">
<a class="" href="../../sources/SalesforceSource/">Salesforce</a>
</li>
<li class="">
<a class="" href="../../sources/SftpSource/">SFTP</a>
</li>
<li class="">
<a class="" href="../../sources/SqlServerSource/">SQL Server</a>
</li>
<li class="">
<a class="" href="../../sources/TeradataSource/">Teradata</a>
</li>
<li class="">
<a class="" href="../../sources/WikipediaSource/">Wikipedia</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Sinks (Writers)</span>
<ul class="subnav">
<li class="">
<a class="" href="../../sinks/AvroHdfsDataWriter/">Avro HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/ParquetHdfsDataWriter/">Parquet HDFS</a>
</li>
<li class="">
<a class="" href="../../sinks/SimpleBytesWriter/">HDFS Byte array</a>
</li>
<li class="">
<a class="" href="../../sinks/ConsoleWriter/">Console</a>
</li>
<li class="">
<a class="" href="../../sinks/CouchbaseWriter/">Couchbase</a>
</li>
<li class="">
<a class="" href="../../sinks/Http/">HTTP</a>
</li>
<li class="">
<a class="" href="../../sinks/Gobblin-JDBC-Writer/">JDBC</a>
</li>
<li class="">
<a class="" href="../../sinks/Kafka/">Kafka</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Adaptors</span>
<ul class="subnav">
<li class="">
<a class="" href="../../adaptors/Gobblin-Distcp/">Gobblin Distcp</a>
</li>
<li class="">
<a class="" href="../../adaptors/Hive-Avro-To-ORC-Converter/">Hive Avro-To-Orc Converter</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Case Studies</span>
<ul class="subnav">
<li class="">
<a class="" href="../../case-studies/Kafka-HDFS-Ingestion/">Kafka-HDFS Ingestion</a>
</li>
<li class="">
<a class="" href="../../case-studies/Publishing-Data-to-S3/">Publishing Data to S3</a>
</li>
<li class="">
<a class="" href="../../case-studies/Writing-ORC-Data/">Writing ORC Data</a>
</li>
<li class="">
<a class="" href="../../case-studies/Hive-Distcp/">Hive Distcp</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Data Management</span>
<ul class="subnav">
<li class="">
<a class="" href="../../data-management/Gobblin-Retention/">Retention</a>
</li>
<li class="">
<a class="" href="../../data-management/DistcpNgEvents/">Distcp-NG events</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Gobblin Metrics</span>
<ul class="subnav">
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics/">Quick Start</a>
</li>
<li class="">
<a class="" href="../../metrics/Existing-Reporters/">Existing Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Metrics-for-Gobblin-ETL/">Metrics for Gobblin ETL</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Architecture/">Gobblin Metrics Architecture</a>
</li>
<li class="">
<a class="" href="../../metrics/Implementing-New-Reporters/">Implementing New Reporters</a>
</li>
<li class="">
<a class="" href="../../metrics/Gobblin-Metrics-Performance/">Gobblin Metrics Performance</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Developer Guide</span>
<ul class="subnav">
<li class="">
<a class="" href="../../developer-guide/Customization-for-New-Source/">Customization for New Source</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Customization-for-Converter-and-Operator/">Customization for Converter and Operator</a>
</li>
<li class="">
<a class="" href="../../developer-guide/CodingStyle/">Code Style Guide</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Gobblin-Compliance-Design/">Gobblin Compliance Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/IDE-setup/">IDE setup</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Monitoring-Design/">Monitoring Design</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Documentation-Architecture/">Documentation Architecture</a>
</li>
<li class="">
<a class="" href="../../developer-guide/Contributing/">Contributing</a>
</li>
<li class="">
<a class="" href="../../developer-guide/GobblinModules/">Gobblin Modules</a>
</li>
<li class="">
<a class="" href="../../developer-guide/HighLevelConsumer/">High Level Consumer</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Project</span>
<ul class="subnav">
<li class="">
<a class="" href="../../project/Feature-List/">Feature List</a>
</li>
<li class="">
<a class="" href="/people">Contributors and Team</a>
</li>
<li class="">
<a class="" href="../../project/Talks-and-Tech-Blogs/">Talks and Tech Blog Posts</a>
</li>
<li class="">
<a class="" href="../../project/Posts/">Posts</a>
</li>
</ul>
</li>
<li class="toctree-l1">
<span class="caption-text">Miscellaneous</span>
<ul class="subnav">
<li class="">
<a class="" href="../../miscellaneous/Camus-to-Gobblin-Migration/">Camus to Gobblin Migration</a>
</li>
<li class="">
<a class="" href="../../miscellaneous/Exactly-Once-Support/">Exactly Once Support</a>
</li>
</ul>
</li>
</ul>
</div>
&nbsp;
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../..">Apache Gobblin</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
<li><a href="../..">Docs</a> &raquo;</li>
<li>User Guide &raquo;</li>
<li>Gobblin Compliance</li>
<li class="wy-breadcrumbs-aside">
<a href="https://github.com/apache/incubator-gobblin/edit/master/docs/user-guide/Gobblin-Compliance.md" rel="nofollow"> Edit on Gobblin</a>
</li>
</ul>
<hr/>
</div>
<div role="main">
<div class="section">
<div class="toc">
<ul>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#usage">Usage</a></li>
<li><a href="#configuration">Configuration</a></li>
<li><a href="#developer-guide">Developer Guide</a></li>
</ul>
</div>
<h1 id="introduction">Introduction</h1>
<hr />
<p>The Gobblin Compliance module allows for data purging to meet regulatory compliance requirements. The module includes functionality for purging datasets and the associated operational support in production.</p>
<p>The purging is performed using Hive meaning that purging of datasets is supported in any format that Hive can read from and write to, including for example ORC and Parquet. Further the purger is built on top of the Gobblin framework which means that the fault-tolerance, scalability and flexibility provided by Gobblin is taken full advantage of.</p>
<h1 id="usage">Usage</h1>
<hr />
<p>As an example, let us assume that regulation requires that once a guest has checked out of a hotel, certain guest data needs to be purged within a certain number of days after the guest leaves. Hence, the goal is to purge the hotel's datasets of data associated with their guests after they have left in order to meet regulatory compliance requirements.</p>
<p>Hive databases and tables are setup to be purged by following these steps:</p>
<ol>
<li>Whitelisting of the database, or table for purging</li>
<li>Specifying the dataset descriptor for the tables to be purged</li>
<li>Specifying the JSON path of the compliance field in the dataset descriptor</li>
<li>The table which contains the list of ids whose associated data is to be purged</li>
<li>The name of the id column in the table to match against</li>
</ol>
<p>For example, in order to purge a Hive table named <code>tracking.event</code>, these properties are specified:</p>
<ul>
<li>Specify the purger whitelist to include tracking.event
<code>gobblin.compliance.dataset.whitelist=tracking.event</code></li>
<li>Add a TBLPROPERTY named <code>dataset.descriptor</code> to the tracking.event Hive table to specify the compliance field to match in the table as an escaped JSON (since it has to be a valid string):
<code>{\"complianceSpec\" : {\"identifierField\" : \"metadata.guestid\" }}</code></li>
<li>Specify the JSON field path for the compliance field (that evaluates to metadata.guestId)
<code>dataset.descriptor.fieldPath=complianceSpec.identifierField</code></li>
<li>Specify the table containing the list of ids to purge
<code>gobblin.compliance.complianceIdTable=u_purger.guestIds</code></li>
<li>Specify the name of the id in the table to match against
<code>gobblin.compliance.complianceId=guestId</code></li>
</ul>
<p>With these properties in place, a Gobblin job can be setup to purge the table. The work unit for the purger is an individual table partition. Hence the purger will iterate over all the partitions in the table, and purge each partition individually, processing as many partitions in parallel as specified (by the property 'gobblin.compliance.purger.maxWorkunits', which defaults to 5).</p>
<h1 id="configuration">Configuration</h1>
<hr />
<p>Configuration options for the Hive Purger include</p>
<table>
<thead>
<tr>
<th>Property</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>gobblin.compliance.dataset.whitelist</td>
<td>The list of databases/tables to purge, comma-separated</td>
</tr>
<tr>
<td>dataset.descriptor.fieldPath</td>
<td>The JSON field path specifying the compliance field</td>
</tr>
<tr>
<td>gobblin.compliance.complianceIdTable</td>
<td>The table containing the list of ids whose data needs to be purged</td>
</tr>
<tr>
<td>gobblin.compliance.complianceId</td>
<td>The name of the id column in the complianceIdTable to match against</td>
</tr>
<tr>
<td>gobblin.compliance.purger.maxWorkunits</td>
<td>The number of partitions to purge in parallel</td>
</tr>
<tr>
<td>gobblin.compliance.purger.policy.class</td>
<td>The policy class that specifies the criteria for purging, defaults to HivePurgerPolicy</td>
</tr>
<tr>
<td>gobblin.compliance.purger.commit.policy.class</td>
<td>The policy class that specifies the criteria for committing a purged dataset, defaults to HivePurgerCommitPolicy</td>
</tr>
</tbody>
</table>
<h1 id="developer-guide">Developer Guide</h1>
<p>The <a href="../developer-guide/Gobblin-Compliance-Design">Developer Guide</a> further describes the design of the module.</p>
</div>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../Gobblin-on-Yarn/" class="btn btn-neutral float-right" title="Gobblin on Yarn">Next <span class="icon icon-circle-arrow-right"></span></a>
<a href="../Gobblin-CLI/" class="btn btn-neutral" title="Gobblin CLI"><span class="icon icon-circle-arrow-left"></span> Previous</a>
</div>
<hr/>
<div role="contentinfo">
<!-- Copyright etc -->
</div>
Built with <a href="http://www.mkdocs.org" rel="nofollow">MkDocs</a> using a <a href="https://github.com/snide/sphinx_rtd_theme" rel="nofollow">theme</a> provided by <a href="https://readthedocs.org" rel="nofollow">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<div class="rst-versions" role="note" style="cursor: pointer">
<span class="rst-current-version" data-toggle="rst-current-version">
<span><a href="../Gobblin-CLI/" style="color: #fcfcfc;">&laquo; Previous</a></span>
<span style="margin-left: 15px"><a href="../Gobblin-on-Yarn/" style="color: #fcfcfc">Next &raquo;</a></span>
</span>
</div>
<script>var base_url = '../..';</script>
<script src="../../js/theme.js" defer></script>
<script src="../../js/extra.js" defer></script>
<script src="../../search/main.js" defer></script>
</body>
</html>