blob: 64e6c49179bdd4300fe262b726fc5ed28faacdd3 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Eagle - Data Classification Tutorial</title>
<meta name="description" content="Eagle - Analyze Big Data Platforms for Security and Performance">
<meta name="keywords" content="Eagle, Hadoop, Security, Real Time">
<meta name="author" content="eBay Inc.">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="/css/animate.css">
<link rel="stylesheet" href="/css/bootstrap.min.css">
<link rel="stylesheet" href="/css/font-awesome.min.css">
<link rel="stylesheet" href="/css/misc.css">
<link rel="stylesheet" href="/css/style.css">
<link rel="stylesheet" href="/css/styles.css">
<link rel="stylesheet" href="/css/main.css">
<link rel="alternate" type="application/rss+xml" title="Eagle" href="http://goeagle.io/feed.xml" />
<link rel="shortcut icon" href="/images/favicon.png">
<!-- Baidu Analytics Tracking -->
<script>
  // Global array shared with the Baidu tracker; reuse it if an earlier script
  // on the page already defined it.
  var _hmt = _hmt || [];
  (function() {
    // Create the tracker <script> dynamically so it is fetched without
    // blocking the HTML parser.
    var hm = document.createElement("script");
    // Explicit https: a protocol-relative URL would inherit http: (or file:)
    // from whatever scheme the page itself was loaded with.
    hm.src = "https://hm.baidu.com/hm.js?fedc55df2ea52777a679192e8f849ece";
    // Insert before the first <script> in the document; at least one always
    // exists because this code is itself running inside a <script> element.
    var s = document.getElementsByTagName("script")[0];
    s.parentNode.insertBefore(hm, s);
  })();
</script>
<!-- Google Analytics Tracking -->
<script>
// analytics.js loader. It stores the name 'ga' in window.GoogleAnalyticsObject,
// defines window.ga (unless it already exists) as a stub that pushes each
// call's arguments onto ga.q, records the current time in ga.l, and inserts an
// async <script> for analytics.js before the first <script> in the document.
// The library URL uses an explicit https: scheme so it never inherits http:
// or file: from the page.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Queue the tracker creation for this property and a page-view hit.
ga('create', 'UA-68929805-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<!-- header start -->
<div id="home_page">
<div class="topbar">
<div class="container">
<div class="row" >
<nav class="navbar navbar-default">
<div class="container-fluid">
<!-- Brand and toggle get grouped for better mobile display -->
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> <span class="sr-only">Toggle navigation</span> <span class="icon-bar"></span> <span class="icon-bar"></span> <span class="icon-bar"></span> </button>
<a class="navbar-brand" href="/"><img src="/images/logo2.png" alt="Apache Eagle" height="44" style="margin-top:-7px"></a> </div>
<!-- Collect the nav links, forms, and other content for toggling -->
<!-- <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav navbar-right" id="top-menu">
<li><a class="menu" href="/#home_page">HOME</a></li>
<li><a class="menu" href="/docs/">DOCS</a></li>
<li><a class="menu" href="/#about_page">ABOUT</a></li>
<li><a class="menu" href="/#diagram_page">ARCHITECTURE</a></li>
<li><a class="menu" href="/#modules_page">MODULES</a></li>
<li><a class="menu" href="/#usecase_page">USE CASES</a></li>
<li>
</li>
</ul> -->
</div>
</div>
<!-- /.container-fluid -->
</nav>
</div>
</div>
</div>
<div class="headerimage">
<div class="flexslider">
<ul class="slides">
<li><img src="/images/slider/4.jpg" alt="Slide 1"></li>
</ul>
</div>
</div>
<div class="particles" style="height:40%"> </div><!---particles-->
<div class="slider-caption" style="top:80px;">
<div class="homewrapper">
<div class="hometitle">
<a href="/">
<img src="/images/feather.png" alt="Apache Eagle home" height="80">
</a>
</div>
<div class="hometext">
<h2>Analyze Big Data Platforms For Security and Performance</h2>
<div class="social-buttons">
<a href="https://github.com/apache/eagle"><i class="fa fa-github"></i></a>
<a href="http://twitter.com/TheApacheEagle"><i class="fa fa-twitter"></i></a>
<a href="https://www.facebook.com/TheApacheEagle/"><i class="fa fa-facebook"></i></a>
<a href="#"><i class="fa fa-weixin"></i></a>
<!-- <a href="https://www.weibo.com/ApacheEagle/"><i class="fa fa-weibo"></i></a> -->
</div>
</div>
</div>
</div>
</div>
<!-- header end -->
<div class="container-fluid page-content">
<div class="row">
<div class="col-md-10 col-md-offset-1">
<!-- sidebar -->
<div class="col-xs-6 col-sm-3" id="sidebar" role="navigation">
<ul class="nav" id="adminnav">
<li class="heading">Getting Started</li>
<li class="sidenavli "><a href="/docs/index.html" data-permalink="/docs/tutorial/classification.html" id="">Introduction</a></li>
<li class="sidenavli "><a href="/docs/usecases.html" data-permalink="/docs/tutorial/classification.html" id="">Use Cases</a></li>
<li class="sidenavli "><a href="/docs/terminology.html" data-permalink="/docs/tutorial/classification.html" id="">Terminology</a></li>
<li class="sidenavli "><a href="/docs/ecosystem.html" data-permalink="/docs/tutorial/classification.html" id="">Ecosystem</a></li>
<li class="sidenavli "><a href="/docs/community.html" data-permalink="/docs/tutorial/classification.html" id="">Community</a></li>
<li class="sidenavli "><a href="/docs/FAQ.html" data-permalink="/docs/tutorial/classification.html" id="">FAQ</a></li>
<li class="divider"></li>
<li class="heading">Documentation</li>
<li class="sidenavli "><a href="/docs/latest/" data-permalink="/docs/tutorial/classification.html" id="">Latest version (v0.5.0)</a></li>
<li class="divider"></li>
<li class="heading">Download</li>
<li class="sidenavli "><a href="/docs/download-latest.html" data-permalink="/docs/tutorial/classification.html" id="">Latest version (v0.5.0)</a></li>
<li class="sidenavli "><a href="/docs/download.html" data-permalink="/docs/tutorial/classification.html" id="">Archived</a></li>
<li class="divider"></li>
<li class="heading">Supplement</li>
<li class="sidenavli "><a href="/docs/security.html" data-permalink="/docs/tutorial/classification.html" id="">Security</a></li>
<li class="divider"></li>
<li class="sidenavli">
<a href="mailto:dev@eagle.apache.org" target="_blank">Need Help?</a>
</li>
</ul>
</div>
<div class="col-xs-6 col-sm-9 page-main-content" style="margin-left: -15px" id="loadcontent">
<h1 class="page-header" style="margin-top: 0px">Data Classification Tutorial</h1>
<p>The Apache Eagle data classification feature provides the ability to classify data with different levels of sensitivity.
Currently this feature is available ONLY for applications monitoring HDFS, Hive<sup id="fnref:HIVE"><a href="#fn:HIVE" class="footnote">1</a></sup> and HBase<sup id="fnref:HBASE"><a href="#fn:HBASE" class="footnote">2</a></sup>. For example, HdfsAuditLog, HiveQueryLog and HBaseSecurityLog.</p>
<p>The main topics of this page are:</p>
<ul>
<li>Cluster Connection</li>
<li>Data Classification</li>
</ul>
<h3 id="cluster-connection">Cluster Connection</h3>
<p>Here we give example configurations for HDFS, HBase, and Hive. Suppose the cluster to monitor is the Hortonworks sandbox. This configuration is located on the admin management page.</p>
<ul>
<li>
<p>HDFS</p>
<p><img src="/images/docs/hdfs-setup.png" alt="hdfs setup" /></p>
<ul>
<li>
<p>Base case</p>
<p>You may configure the default path that Apache Hadoop clients use to connect to the remote HDFS NameNode.</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.fs.defaultFS=hdfs://sandbox.hortonworks.com:8020
</code></pre>
</div>
</li>
<li>
<p>HA case</p>
<p>Basically, you point your fs.defaultFS at your nameservice and let the client know how it is configured (the backing namenodes) and how to fail over between them in HA mode.</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.fs.defaultFS=hdfs://nameservice1
classification.dfs.nameservices=nameservice1
classification.dfs.ha.namenodes.nameservice1=namenode1,namenode2
classification.dfs.namenode.rpc-address.nameservice1.namenode1=hadoopnamenode01:8020
classification.dfs.namenode.rpc-address.nameservice1.namenode2=hadoopnamenode02:8020
classification.dfs.client.failover.proxy.provider.nameservice1=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
</code></pre>
</div>
</li>
<li>
<p>Kerberos-secured cluster</p>
<p>For a Kerberos-secured cluster, you need to get a keytab file and the principal from your admin, and configure “eagle.keytab.file” and “eagle.kerberos.principal” to authenticate Eagle's access.</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.eagle.keytab.file=/EAGLE-HOME/.keytab/eagle.keytab
classification.eagle.kerberos.principal=eagle@SOMEWHERE.COM
</code></pre>
</div>
<p>If there is an exception about “invalid server principal name”, you may need to check the DNS resolver or the data transfer settings, such as “dfs.encrypt.data.transfer”, “dfs.encrypt.data.transfer.algorithm”, “dfs.trustedchannel.resolver.class”, and “dfs.datatransfer.client.encrypt”.</p>
</li>
</ul>
</li>
<li>Hive
<ul>
<li>
<p>Basic</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.accessType=metastoredb_jdbc
classification.password=hive
classification.user=hive
classification.jdbcDriverClassName=com.mysql.jdbc.Driver
classification.jdbcUrl=jdbc:mysql://sandbox.hortonworks.com/hive?createDatabaseIfNotExist=true
</code></pre>
</div>
</li>
</ul>
</li>
<li>
<p>HBase</p>
<ul>
<li>
<p>Basic case</p>
<p>You need to set the “hbase.zookeeper.quorum” property (for example, to “localhost”) and the “hbase.zookeeper.property.clientPort” property.</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.hbase.zookeeper.property.clientPort=2181
classification.hbase.zookeeper.quorum=localhost
</code></pre>
</div>
</li>
<li>
<p>Kerberos-secured cluster</p>
<p>According to your environment, you can add or remove some of the following properties. Here is the reference.</p>
<div class="highlighter-rouge"><pre class="highlight"><code> classification.hbase.zookeeper.property.clientPort=2181
classification.hbase.zookeeper.quorum=localhost
classification.hbase.security.authentication=kerberos
classification.hbase.master.kerberos.principal=hadoop/_HOST@EXAMPLE.COM
classification.zookeeper.znode.parent=/hbase
classification.eagle.keytab.file=/EAGLE-HOME/.keytab/eagle.keytab
classification.eagle.kerberos.principal=eagle@EXAMPLE.COM
</code></pre>
</div>
</li>
</ul>
</li>
</ul>
<p>If you have any questions about the Kerberos configuration in Eagle, please check the <a href="/docs/FAQ.html">FAQ</a> first.</p>
<h3 id="data-classification">Data Classification</h3>
<p>After the configuration is updated, we can go back to the classification page on Eagle UI. Here we take HdfsAuditLog as an example to explain how to classify data and how to monitor sensitive data in Eagle.</p>
<h4 id="part-1-sensitivity-edit"><strong>Part 1: Sensitivity Edit</strong></h4>
<ul>
<li>
<p>Add the sensitivity mark to files/directories.</p>
<ul>
<li>
<p><strong>Basic</strong>: Label sensitive files directly (<strong>recommended</strong>)</p>
<p><img src="/images/docs/hdfs-mark1.png" alt="HDFS classification" />
<img src="/images/docs/hdfs-mark2.png" alt="HDFS classification" />
<img src="/images/docs/hdfs-mark3.png" alt="HDFS classification" /></p>
</li>
<li>
<p><strong>Advanced</strong>: Import json file/content</p>
<p><img src="/images/docs/hdfs-import1.png" alt="HDFS classification" />
<img src="/images/docs/hdfs-import2.png" alt="HDFS classification" />
<img src="/images/docs/hdfs-import3.png" alt="HDFS classification" /></p>
</li>
</ul>
</li>
<li>
<p>Remove the sensitivity mark from files/directories.</p>
<ul>
<li>
<p><strong>Basic</strong>: remove label directly</p>
<p><img src="/images/docs/hdfs-delete1.png" alt="HDFS classification" />
<img src="/images/docs/hdfs-delete2.png" alt="HDFS classification" /></p>
</li>
<li>
<p><strong>Advanced</strong>: delete in batch</p>
<p><img src="/images/docs/hdfs-remove.png" alt="HDFS classification" /></p>
</li>
</ul>
</li>
</ul>
<h4 id="part-2-monitor-sensitive-data"><strong>Part 2: Monitor sensitive data</strong></h4>
<p>You can mark a particular folder/file as “PRIVATE”. Once you have this information you can create policies using this label.</p>
<blockquote>
<p>For example: the following policy monitors all the operations to resources with sensitivity type “PRIVATE”.</p>
</blockquote>
<p><img src="/images/docs/sensitivity-policy.png" alt="sensitivity type policy" /></p>
<hr />
<h4 id="footnotes"><em>Footnotes</em></h4>
<div class="footnotes">
<ol>
<li id="fn:HIVE">
<p><em>All mentions of “hive” on this page represent Apache Hive.</em>&nbsp;<a href="#fnref:HIVE" class="reversefootnote">&#8617;</a></p>
</li>
<li id="fn:HBASE">
<p><em>All mentions of “hbase” on this page represent Apache HBase.</em>&nbsp;<a href="#fnref:HBASE" class="reversefootnote">&#8617;</a></p>
</li>
</ol>
</div>
</div><!--end of loadcontent-->
</div>
<!--end of centered content-->
</div>
</div>
<!--end of container-->
<!-- footer start -->
<div class="footerwrapper">
<div class="container">
<div class="row">
<div class="col-md-12"><div style="margin-left:auto; margin-right:auto; text-align:center;font-size: 12px">
<div>
</div>
<div>
<a href="http://www.apache.org">
<img id="asf-logo" alt="Apache Software Foundation" src="/images/apache-logo-small.gif">
</a>
</div>
<div>
Copyright © 2015 <a href="http://www.apache.org">The Apache Software Foundation</a>, Licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</div>
<div>
Apache Eagle, Eagle, Apache Hadoop, Hadoop, Apache HBase, HBase, Apache Hive, Hive, Apache Ambari, Ambari, Apache Spark, Spark, Apache Kafka, Kafka, Apache Storm, Storm, Apache Maven, Maven, Apache Tomcat, Tomcat, Apache Derby, Derby, Apache Cassandra, Cassandra, Apache ZooKeeper, ZooKeeper, Apache, the Apache feather logo, and the Apache project logo are trademarks of The Apache Software Foundation.
</div>
</div></div>
</div>
</div>
</div>
<!-- footer end -->
<!-- JavaScripts -->
<script src="/js/jquery-1.11.1.min.js"></script>
<script src="/js/jquery.singlePageNav.js"></script>
<script src="/js/jquery.flexslider.js"></script>
<script src="/js/modernizr.min.js"></script>
<script src="/js/svg.js"></script>
<script>
  /************** FlexSlider *********************/
  // Initialise every .flexslider element exactly once: slides cross-fade and
  // the previous/next arrow navigation is turned off.
  $('.flexslider').flexslider({
    animation: "fade",
    directionNav: false
  });
</script>
</body>
</html>