<!doctype html>
<!--[if lt IE 7]><html lang="en-US" class="no-js lt-ie9 lt-ie8 lt-ie7"><![endif]-->
<!--[if (IE 7)&!(IEMobile)]><html lang="en-US" class="no-js lt-ie9 lt-ie8"><![endif]-->
<!--[if (IE 8)&!(IEMobile)]><html lang="en-US" class="no-js lt-ie9"><![endif]-->
<!--[if gt IE 8]><!-->
<html lang="en-US" class="no-js">
<!--<![endif]-->
<head>
<!-- Document metadata: character set first, then IE compatibility mode and the page title. -->
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Jupyter Notebooks for Data Analysis - Apache Spot</title>
<!-- Mobile hints and viewport. -->
<meta name="HandheldFriendly" content="True">
<meta name="MobileOptimized" content="320">
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<!-- Icons and browser/tile theming. -->
<link rel="apple-touch-icon" href="../../library/images/apple-touch-icon.png">
<link rel="icon" href="../../favicon.png">
<!--[if IE]>
<link rel="shortcut icon" href="http://spot.incubator.apache.org/favicon.ico">
<![endif]-->
<meta name="msapplication-TileColor" content="#f01d4f">
<meta name="msapplication-TileImage" content="../../library/images/win8-tile-icon.png">
<meta name="theme-color" content="#121212">
<link rel='dns-prefetch' href='//fonts.googleapis.com' />
<link rel='dns-prefetch' href='//s.w.org' />
<link rel="alternate" type="application/rss+xml" title="Apache Spot &raquo; Feed" href="../../feed/" />
<!-- Stylesheets. The web font is requested over HTTPS: an http:// stylesheet is blocked as mixed content when this page is served over HTTPS. -->
<link rel='stylesheet' id='googleFonts-css' href='https://fonts.googleapis.com/css?family=Lato%3A400%2C700%2C400italic%2C700italic' type='text/css' media='all' />
<link rel='stylesheet' id='bones-stylesheet-css' href='../../library/css/style.css' type='text/css' media='all' />
<!--[if lt IE 9]>
<link rel='stylesheet' id='bones-ie-only-css' href='http://spot.incubator.apache.org/library/css/ie.css' type='text/css' media='all' />
<![endif]-->
<link rel='stylesheet' id='mm-css-css' href='../../library/css/meanmenu.css' type='text/css' media='all' />
<!-- Scripts load synchronously in this order; jquery-migrate and meanmenu come after jQuery (presumably because they depend on it; keep the order). -->
<script type='text/javascript' src='../../library/js/libs/modernizr.custom.min.js'></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.1.1/jquery.min.js"></script>
<script type='text/javascript' src='../../library/js/jquery-migrate.min.js'></script>
<script type='text/javascript' src='../../library/js/jquery.meanmenu.js'></script>
<script>
// Google Analytics loader: defines a queueing `ga` function, injects analytics.js
// as an async script, then creates the tracker and records a pageview.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-87470508-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body class="single single-post">
<div id="container">
<!-- Site header: logo plus the primary navigation menu. -->
<header class="header">
<div id="inner-header" class="wrap cf">
<p id="logo" class="h1" itemscope itemtype="http://schema.org/Organization">
<a href="http://spot.incubator.apache.org/" rel="nofollow"><img src="../../library/images/logo.png" alt="Apache Spot" /></a>
</p>
<nav>
<ul id="menu-main-menu" class="nav top-nav cf">
<li id="menu-item-129" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-129">
<a href="../../get-started">Get Started</a>
<ul class="sub-menu">
<li><a href="../../get-started">Get Started</a></li>
<li><a href="../../get-started/supporting-apache">Supporting Apache</a></li>
<li><a href="../../get-started/environment">Environment</a></li>
<li><a href="../../get-started/architecture">Architecture</a></li>
<li><a href="../../get-started/demo">Demo</a></li>
</ul>
</li>
<li id="menu-item-5" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-5">
<a href="../../download">Download</a>
</li>
<li id="menu-item-130" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-130">
<a href="../../community">Community</a>
<ul class="sub-menu com-sm">
<li class="dropmenu-head">Get in Touch</li>
<li><a href="../../community" class="mail">Mailing Lists</a></li>
<li class="divider"></li>
<li><a href="../../community/committers">Project Committers</a></li>
<li><a href="../../community/contribute">How to Contribute</a></li>
<li class="divider"></li>
<li class="dropmenu-head">Developer Resources</li>
<!-- External links open in a new tab; rel="noopener" keeps the opened page from reaching back through window.opener. -->
<li><a href="https://github.com/apache/incubator-spot" target="_blank" rel="noopener" class="github">Github</a></li>
<li><a href="https://issues.apache.org/jira/browse/SPOT/" target="_blank" rel="noopener" class="jira">JIRA Issue Tracker</a></li>
<!-- The ampersand in the query string is written as &amp; so the attribute value is valid HTML. -->
<li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?spaceKey=SPOT&amp;title=Apache+Spot+%28Incubating%29+Home" target="_blank" rel="noopener" class="">Confluence Site</a></li>
<li class="divider"></li>
<li class="dropmenu-head">Social Media</li>
<li><a href="https://twitter.com/ApacheSpot" target="_blank" rel="noopener" class="twitter-icon">Twitter</a></li>
</ul>
</li>
<li id="menu-item-106" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-106">
<a href="../../doc">Documentation</a>
</li>
<li class="menu-item menu-item-has-children">
<a href="#">Project Components</a>
<ul class="sub-menu">
<li><a href="../../project-components/ingestion">Ingestion</a></li>
<li><a href="../../project-components/machine-learning">Machine Learning</a></li>
<li><a href="../../project-components/suspicious-connects-analysis">Suspicious Connects Analysis</a></li>
<li><a href="../../project-components/visualization">Visualization</a></li>
<li class="under-dev">Under Development</li>
<li><a href="../../project-components/open-data-models">Open Data Models</a></li>
</ul>
</li>
<!-- "active" marks the Blog entry as the current section for this page. -->
<li id="menu-item-13" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-13 active">
<a href="../../blog">Blog</a>
</li>
</ul>
</nav>
</div>
</header>
<div id="mobile-nav"></div>
<div id="content">
<div id="inner-content" class="wrap cf">
<main id="main" class="m-all t-2of3 d-5of7 cf" role="main" itemscope itemprop="mainContentOfPage" itemtype="http://schema.org/Blog">
<article id="post-136" class="cf post-136 post type-post status-publish format-standard hentry category-uncategorized" role="article" itemscope itemprop="blogPost" itemtype="http://schema.org/BlogPosting">
<!-- Post header: headline plus the publication date in both machine-readable and human-readable form. -->
<header class="article-header entry-header">
<h1 class="entry-title single-title" itemprop="headline" rel="bookmark">Jupyter Notebooks for Data Analysis</h1>
<p class="byline entry-meta vcard">
<time class="updated entry-time" datetime="2016-09-22" itemprop="datePublished">
September 22, 2016
</time>
</p>
</header>
<section class="entry-content cf" itemprop="articleBody">
<p>
<strong>Why Does Apache Spot Include iPython notebooks? </strong>
</p>
<p>
The project team wants Apache Spot to be a versatile tool that can be used by anyone. This means that data scientists and developers need to be able to query and handle the source data to find all the information they need for their decision making. The iPython Notebook is an appropriate platform for easy data exploration. One of its biggest advantages is that it provides parallel and distributed computing to enable code execution and debugging in an interactive environment – thus the ‘i’ in iPython.
</p>
<p>
The iPython notebook is a web based interactive computational environment that provides access to the Python shell. While iPython notebooks were originally designed to work with the Python language, they support a number of other programming languages, including Ruby, Scala, Julia, R, Go, C, C++, Java and Perl. There are also multiple additional packages that can be used to get the most out of this highly-customizable tool.
</p>
<p>
Starting with version 4.0, most notebook functionality is now part of Project Jupyter, while iPython remains the kernel for working with Python code in the notebooks.
</p>
<img src="../../library/images/iPython-1.png" alt="ipython" class="aligncenter size-full wp-image-140" />
<p>
<strong>IPython with Apache Spot for Network Threat Detection</strong>
</p>
<p>
<em>NOTE:  This is not intended to be a step-by-step tutorial on how to code a threat analysis in Apache Spot, but more like an introduction on how to approach the suspicions of a security breach.</em>
</p>
<p>
Although machine learning (ML) will do most of the work detecting anomalies in the traffic, Apache Spot also includes two notebook templates that can get you started on this. The <em>Threat_Investigation_master.ipynb</em> is designed to query the raw data table to find all connections in a day that are related to any threat you select – even connections that were not necessarily flagged as suspicious by ML on a first run. This gives us the chance to get a new data subset and here is where the fun begins.
</p>
<p>
If you suspect a specific type of attack in your network, you can get the whole story by answering the Five ‘W’s.
</p>
<p>
<strong><em>What? </em></strong>
</p>
<p>
Maybe there’s been an increase in the logs collected by the system, which indicates abnormal amounts of communication in your network. Or, the number of POST requests in your network has risen overnight. This is the mystery that needs to be solved by researching through the anomalies previously detected by ML.
</p>
<p>
<strong><em>Who?</em></strong>
</p>
<p>
Assuming you have a network context, you can identify the name of the infected machine inside the network, as well as the name of the IP or DNS on the other side of the connection (if it is a known host). If you don’t have a network context or are using DHCP, this can be a little tricky to detect using only Netflow logs. But that’s where DNS and Proxy logs come in handy. Including a network context file with Apache Spot is really simple and can go a long way when identifying a threat.
</p>
<p>
<strong><em>When?</em></strong>
</p>
<p>
To gain broader visibility into the attack, you can customize the queries in the Threat investigation notebook to review the data over a wider time span – instead of just checking the current day. With this, you could find an increase in a certain type of request to one (or many) URIs and predict its future behavior.
</p>
<p>
<strong><em>Where?</em></strong>
</p>
<p>
When working only with DNS, having a destination URL might not say much about where your information is going, but Apache Spot allows you to connect with a geolocation database to identify the location of the suspected attacker’s IP. Taking advantage of this option, you can visually locate the other end of the connection on a map. You might find that it’s pointing to a country banned by your company, indicating a leak.
</p>
<p>
<strong><em>Why?</em></strong>
</p>
<p>
The answer to “why” will depend highly on the result of the analysis. For instance, an excessive amount of POST requests from one machine inside the network to an unidentified URI can indicate a data mining attack. Tracing back to patient zero, you can find that this could have originated with a phishing email, malicious software installed by an employee or a one-time visitor’s infected machine that connected to your network.
</p>
<p>
<strong>How to Get Answers to the Five Ws Questions</strong>
</p>
<p>
All of the previous questions can be answered by looking at the raw data collected. Although performing elaborate queries directly against your database can seem tempting, this type of analysis with Hive, or even Impala, can be very time consuming. A better approach would be to use Pandas to read and transform your dataset into a relational structured dataframe. This lets you work with it as if it were an offline structured relational database.
</p>
<p>
Once you have your desired results and data subsets, you can use MatplotLib to easily graph your findings. (We cover this subject in more depth in another post.) Another advantage of the notebook is that you can download it as HTML or a PDF file to store locally and use it in a presentation – or just keep it for future reference.
</p>
<p>
<strong>Wrap Up</strong>
</p>
<p>
This post was meant to be just a brief introduction to how you can use iPython notebooks in Apache Spot to perform further data analysis and include it in your executive report (in addition to the already included Story board). Although this is not the only way you can do this, it is a very interactive and fun way to do it. You’ll also see that the overall processing time is very short – thanks to the iPython notebook task parallelism ability.
</p>
<p>
We want to hear from YOU! Have you used iPython notebooks before? How do you feel about having this tool in Apache Spot? If you’re interested in further data analysis through interactive charts, a new post is coming soon on D3 and jQuery data visualization. Also, check back soon to read more on this and other Cybersecurity subjects.
</p>
</section>
<footer class="article-footer">
filed under: <a href="../../category/data-science/" rel="category tag">Data Science</a>, <a href="../../category/ipython-notebooks/" rel="category tag">Ipython Notebooks</a>, <a href="../../category/threat-analysis-tools/" rel="category tag">Threat Analysis Tools</a>
</footer>
</article>
</main>
<div id="sidebar1" class="sidebar m-all t-1of3 d-2of7 last-col cf" role="complementary">
<!-- Widget: links to recent blog posts. -->
<div id="recent-posts-2" class="widget widget_recent_entries">
<h4 class="widgettitle">Recent Posts</h4>
<ul>
<li><a href="../../blog/apache-spot-product-architecture-overview/">Apache Spot Product Architecture Overview</a></li>
<li><a href="../../blog/strength-in-numbers-why-consider-open-source-cybersecurity-analytics/">Strength in Numbers: Why Consider Open Source Cybersecurity Analytics</a></li>
<li><a href="../../blog/jupyter-notebooks-for-data-analysis/">Jupyter Notebooks for Data Analysis</a></li>
<li><a href="../../blog/apache-spot-and-cybersecurity-using-netflows-to-detect-threats-to-critical-infrastructure/">Apache Spot (Incubating) and Cybersecurity — Using NetFlows to Detect Threats to Critical Infrastructure</a></li>
<li><a href="../../blog/how-apache-spot-helps-create-well-stocked-data-lakes-and-catch-powerful-insights/">How Apache Spot (Incubating) Helps Create Well-Stocked Data Lakes and Catch Powerful Insights</a></li>
<li><a href="../../blog/apache-spot-3-most-asked-questions/">Apache Spot (Incubating): 3 Most-Asked Questions</a></li>
</ul>
</div>
<!-- Widget: monthly archive links, newest first. -->
<div id="archives-2" class="widget widget_archive">
<h4 class="widgettitle">Archives</h4>
<ul>
<li><a href="../../2017/03/">March 2017</a></li>
<li><a href="../../2016/10/">October 2016</a></li>
<li><a href="../../2016/09/">September 2016</a></li>
<li><a href="../../2016/08/">August 2016</a></li>
<li><a href="../../2016/03/">March 2016</a></li>
</ul>
</div>
</div>
</div>
</div>
<!-- Site footer: centered copyright notice. -->
<footer class="footer" itemscope itemtype="http://schema.org/WPFooter" role="contentinfo">
<div id="inner-footer" class="wrap cf">
<p class="source-org copyright" style="text-align:center;">&copy; 2019 Apache Spot.</p>
</div>
</footer>
</div>
<a href="#0" class="cd-top">Top</a>
<script type='text/javascript' src='../../library/js/scripts.js'></script>
</body>
</html>