blob: 67ab932fd4f89236597c6acbc9df880bf0c6662c [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink: Monitoring and Controlling Networks of IoT Devices with Flink Stateful Functions</title>
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<link rel="stylesheet" href="/css/flink.css">
<link rel="stylesheet" href="/css/syntax.css">
<!-- Blog RSS feed -->
<link href="/blog/feed.xml" rel="alternate" type="application/rss+xml" title="Apache Flink Blog: RSS feed" />
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<!-- We need to load Jquery in the header for custom google analytics event tracking-->
<script src="/js/jquery.min.js"></script>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Main content. -->
<div class="container">
<div class="row">
<div id="sidebar" class="col-sm-3">
<!-- Top navbar. -->
<nav class="navbar navbar-default">
<!-- The logo. -->
<div class="navbar-header">
<!-- Mobile menu toggle. The three icon bars carry no text, so the sr-only label
     gives the button an accessible name; aria-controls/aria-expanded tie it to
     the collapsible navigation list it opens and closes. -->
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false" aria-controls="bs-example-navbar-collapse-1">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="/">
<img alt="Apache Flink" src="/img/flink-header-logo.svg" width="147px" height="73px">
</a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav navbar-main">
<!-- First menu section explains visitors what Flink is -->
<!-- What is Stream Processing? -->
<!--
<li><a href="/streamprocessing1.html">What is Stream Processing?</a></li>
-->
<!-- What is Flink? -->
<li><a href="/flink-architecture.html">What is Apache Flink?</a></li>
<!-- What is Stateful Functions? -->
<li><a href="/stateful-functions.html">What is Stateful Functions?</a></li>
<!-- Use cases -->
<li><a href="/usecases.html">Use Cases</a></li>
<!-- Powered by -->
<li><a href="/poweredby.html">Powered By</a></li>
&nbsp;
<!-- Second menu section aims to support Flink users -->
<!-- Downloads -->
<li><a href="/downloads.html">Downloads</a></li>
<!-- Getting Started -->
<li class="dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#">Getting Started<span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="https://ci.apache.org/projects/flink/flink-docs-release-1.11/getting-started/index.html" target="_blank">With Flink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-release-2.1/getting-started/project-setup.html" target="_blank">With Flink Stateful Functions <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="/training.html">Training Course</a></li>
</ul>
</li>
<!-- Documentation -->
<li class="dropdown">
<a class="dropdown-toggle" data-toggle="dropdown" href="#">Documentation<span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="https://ci.apache.org/projects/flink/flink-docs-release-1.11" target="_blank">Flink 1.11 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-docs-master" target="_blank">Flink Master (Latest Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-release-2.1" target="_blank">Flink Stateful Functions 2.1 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-master" target="_blank">Flink Stateful Functions Master (Latest Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
</ul>
</li>
<!-- getting help -->
<li><a href="/gettinghelp.html">Getting Help</a></li>
<!-- Blog -->
<li><a href="/blog/"><b>Flink Blog</b></a></li>
<!-- Flink-packages -->
<li>
<a href="https://flink-packages.org" target="_blank">flink-packages.org <small><span class="glyphicon glyphicon-new-window"></span></small></a>
</li>
&nbsp;
<!-- Third menu section aim to support community and contributors -->
<!-- Community -->
<li><a href="/community.html">Community &amp; Project Info</a></li>
<!-- Roadmap -->
<li><a href="/roadmap.html">Roadmap</a></li>
<!-- Contribute -->
<li><a href="/contributing/how-to-contribute.html">How to Contribute</a></li>
<!-- GitHub -->
<li>
<a href="https://github.com/apache/flink" target="_blank">Flink on GitHub <small><span class="glyphicon glyphicon-new-window"></span></small></a>
</li>
&nbsp;
<!-- Language Switcher -->
<li>
<a href="/zh/2020/08/19/statefun.html">中文版</a>
</li>
</ul>
<ul class="nav navbar-nav navbar-bottom">
<hr />
<!-- Twitter -->
<li><a href="https://twitter.com/apacheflink" target="_blank">@ApacheFlink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<!-- Visualizer -->
<li class=" hidden-md hidden-sm"><a href="/visualizer/" target="_blank">Plan Visualizer <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<hr />
<li><a href="https://apache.org" target="_blank">Apache Software Foundation <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li>
<style>
.smalllinks:link {
display: inline-block !important; background: none; padding-top: 0px; padding-bottom: 0px; padding-right: 0px; min-width: 75px;
}
</style>
<a class="smalllinks" href="https://www.apache.org/licenses/" target="_blank">License</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/security/" target="_blank">Security</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Donate</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
</li>
</ul>
</div><!-- /.navbar-collapse -->
</nav>
</div>
<div class="col-sm-9">
<div class="row-fluid">
<div class="col-sm-12">
<div class="row">
<h1>Monitoring and Controlling Networks of IoT Devices with Flink Stateful Functions</h1>
<p><i></i></p>
<article>
<p>19 Aug 2020 Igal Shilman (<a href="https://twitter.com/IgalShilman">@IgalShilman</a>)</p>
<p>In this blog post, we’ll take a look at a class of use cases that is a natural fit for <a href="https://flink.apache.org/stateful-functions.html">Flink Stateful Functions</a>: monitoring and controlling networks of connected devices (often called the “Internet of Things” (IoT)).</p>
<p>IoT networks are composed of many individual, but interconnected components, which makes getting some kind of high-level insight into the status, problems, or optimization opportunities in these networks not trivial. Each individual device “sees” only its own state, which means that the status of groups of devices, or even the network as a whole, is often a complex aggregation of the individual devices’ state. Diagnosing, controlling, or optimizing these groups of devices thus requires distributed logic that analyzes the “bigger picture” and then acts upon it.</p>
<p>A powerful approach to implement this is using <em><a href="https://en.wikipedia.org/wiki/Digital_twin">digital twins</a></em>: each device has a corresponding virtual entity (i.e. the digital twin), which also captures their relationships and interactions. The digital twins track the status of their corresponding devices and send updates to other twins, representing groups (such as geographical regions) of devices. Those, in turn, handle the logic to obtain the network’s aggregated view, or this “bigger picture” we mentioned before.</p>
<h1 id="our-scenario-datacenter-monitoring-and-alerting">Our Scenario: Datacenter Monitoring and Alerting</h1>
<!-- Fig. 1: floated illustration for the scenario section. The alt text summarizes
     the layout the article describes for this figure; the caption stays visible. -->
<figure style="float:right;padding-left:1px;padding-top: 20px">
<img src="/img/blog/2020-08-18-statefun/rack.png" width="350px" alt="Server racks whose top-of-rack switches connect through a larger switch" />
<figcaption style="padding-top: 10px;text-align:center"><b>Fig.1</b> An oversimplified view of a data center.</figcaption>
</figure>
<p>There are many examples of the digital twins approach in the real world, such as <a href="https://www.infoq.com/presentations/tesla-vpp/">smart grids of batteries</a>, <a href="https://www.alibabacloud.com/solutions/intelligence-brain/city">smart cities</a>, or <a href="https://www.youtube.com/watch?v=9y27FJgz5-M">monitoring infrastructure software clusters</a>. In this blogpost, we’ll use the example of data center monitoring and alert correlation implemented with Stateful Functions.</p>
<p>Consider a very simplified view of a data center, consisting of many thousands of commodity servers arranged in server racks. Each server rack typically contains up to 40 servers, with a ToR (Top of the Rack) network switch connected to each server. The switches from all the racks connect through a larger switch (<strong>Fig. 1</strong>).</p>
<p>In this datacenter, many things can go wrong: a disk in a server can stop working, network cards can start dropping packets, or ToR switches might cease to function. The entire data center might also be affected by power supply degradation, causing servers to operate at reduced capacity. On-site engineers must be able to identify these incidents quickly and fix them promptly.</p>
<p>Diagnosing individual server failures is rather straightforward: take a recent history of metric reports from that particular server, analyse it and pinpoint the anomaly. On the other hand, other incidents only make sense “together”, because they share a common root cause. Diagnosing or predicting causes of networking degradation at a rack or datacenter level requires an aggregate view of metrics (such as packet drop rates) from the individual machines and racks, and possibly some prediction model or diagnosis code that runs under certain conditions.</p>
<h2 id="monitoring-a-virtual-datacenter-via-digital-twins">Monitoring a Virtual Datacenter via Digital Twins</h2>
<p>For the sake of this blog post, our oversimplified data center has some servers and racks, each with a unique ID. Each server has a metrics-collecting daemon that publishes metrics to a message queue, and there is a provisioning service that operators will use to request the commissioning and decommissioning of servers.</p>
<div style="line-height:60%;">
<br />
</div>
<center>
<img src="/img/blog/2020-08-18-statefun/1.png" width="550px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<p>Our application will consume these server metrics and commission/decommission events, and produce server/rack/datacenter alerts. There will also be an operator consuming any alerts triggered by the monitoring system. In the next section, we’ll show how this use case can be naturally modeled with Stateful Functions (StateFun).</p>
<h2 id="implementing-the-use-case-with-flink-statefun">Implementing the use case with Flink StateFun</h2>
<div class="alert alert-info">
<p>You can find the code for this example at: <a href="https://github.com/igalshilman/iot-statefun-blogpost">https://github.com/igalshilman/iot-statefun-blogpost</a></p>
</div>
<p>The basic building block for modeling a StateFun application is a <a href="https://ci.apache.org/projects/flink/flink-statefun-docs-release-2.1/concepts/application-building-blocks.html#stateful-functions"><em>stateful function</em></a>, which has the following properties:</p>
<ul>
<li>
<p>It has a logical unique address; and persisted, fault tolerant state, scoped to that address.</p>
</li>
<li>
<p>It can <em>react</em> to messages, both internal (i.e. sent from other stateful functions) and external (e.g. a message from Kafka).</p>
</li>
<li>
<p>Invocations of a specific function are serializable, so messages sent to a specific address are <strong>not</strong> executed concurrently.</p>
</li>
<li>
<p>There can be many billions of function instances in a single StateFun cluster.</p>
</li>
</ul>
<p>To model our use case, we’ll define three functions: <strong>ServerFun</strong>, <strong>RackFun</strong> and <strong>DataCenterFun</strong>.</p>
<p><strong>ServerFun</strong></p>
<p>Each physical server is represented with its <em>digital twin</em> stateful function. This function is responsible for:</p>
<ol>
<li>
<p>Maintaining a sliding window of incoming metrics.</p>
</li>
<li>
<p>Applying a model that decides whether or not to trigger an alert.</p>
</li>
<li>
<p>Alerting if metrics are missing for too long.</p>
</li>
<li>
<p>Notifying its containing <strong>RackFun</strong> about any open incidents.</p>
</li>
</ol>
<p><strong>RackFun</strong></p>
<p>While the <em>ServerFun</em> is responsible for identifying server-local incidents, we need a function that correlates incidents happening on the different servers deployed in the same rack and:</p>
<ol>
<li>
<p>Collects open incidents reported by the <strong>ServerFun</strong> functions.</p>
</li>
<li>
<p>Maintains a histogram of currently open incidents on this rack.</p>
</li>
<li>
<p>Applies a correlation model to the individual incidents sent by the <strong>ServerFun</strong>, and reports high-level, related incidents as a single incident to the <strong>DataCenterFun</strong>.</p>
</li>
</ol>
<p><strong>DataCenterFun</strong></p>
<p>This function maintains a view of incidents across different racks in our datacenter.</p>
<div style="line-height:60%;">
<br />
</div>
<center>
<img src="/img/blog/2020-08-18-statefun/2.png" width="600px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<p>To summarize our plan:</p>
<ul>
<li>
<p>Leaf functions ingest raw metric data (<span style="color:blue">blue</span> lines), and apply localized logic to trigger an alert.</p>
</li>
<li>
<p>Intermediate functions operate on already summarized events (<span style="color:orange">orange</span> lines) and correlate them into high-level events.</p>
</li>
<li>
<p>A root function correlates the high-level events across the intermediate functions and into a single <em>healthy/not healthy</em> value.</p>
</li>
</ul>
<h2 id="how-does-it-really-look">How does it really look?</h2>
<h3 id="serverfun">ServerFun</h3>
<center>
<img src="/img/blog/2020-08-18-statefun/3_1.png" width="600px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<ol>
<li>This section associates a behaviour with every message that the function expects to be invoked with.</li>
<li>The <code>metricsHistory</code> buffer is our sliding window of the last 15 minutes worth of <code>ServerMetricReports</code>. Note that this buffer is configured to expire entries 15 minutes after they were written.</li>
<li><code>serverHealthState</code> represents the current physical server state, open incidents and so on.</li>
</ol>
<p>Let’s take a look at what happens when a <code>ServerMetricReport</code> message arrives:</p>
<center>
<img src="/img/blog/2020-08-18-statefun/3_2.png" width="600px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<ol>
<li>Retrieve the previously computed <code>serverHealthState</code> that is kept in state.</li>
<li>Evaluate a model on the sliding window of the previous metric reports + the current metric reported + the previously computed server state to obtain an assessment of the current server health.</li>
<li>If the server is not believed to be healthy, emit an alert via an alerts topic, and also send a message to our containing rack with all the open incidents that this server currently has.</li>
</ol>
<div class="alert alert-warning">
<p>We’ll omit the other handlers for brevity, but it’s important to mention that <b>onTimer</b> makes sure that metric reports are coming in periodically, otherwise it’d trigger an alert stating that we didn’t hear from that server for a long time.</p>
</div>
<h3 id="rackfun">RackFun</h3>
<center>
<img src="/img/blog/2020-08-18-statefun/5.png" width="650px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<ol>
<li>This function keeps a mapping between a <code>ServerId</code> and a set of open incidents on that server.</li>
<li>When new alerts are received, this function tries to correlate the alert with any other open alerts on that rack. If a correlated rack alert is present, this function notifies the <strong>DataCenterFun</strong> about it.</li>
</ol>
<h3 id="datacenterfun">DataCenterFun</h3>
<center>
<img src="/img/blog/2020-08-18-statefun/6.png" width="650px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<ol>
<li>A persisted mapping between a <code>RackId</code> and the latest alert that rack reported.</li>
<li>Through the use of ingress/egress pairs, this function can report back its current view of which racks are known to be unhealthy.</li>
<li>An operator (via a front-end) can send a <code>GetUnhealthyRacks</code> message addressed to that <strong>DataCenterFun</strong>, and wait for the corresponding response <code>message(UnhealthyRacks)</code>. Whenever a rack reports <em>OK</em>, it’ll be removed from the unhealthy racks map.</li>
</ol>
<h2 id="conclusion">Conclusion</h2>
<p>This pattern — where each layer of functions performs a stateful aggregation of events sent from the previous layer (or the input) — is useful for a whole class of problems. And, although we used connected devices to motivate this use case, it’s not limited to the IoT domain.</p>
<center>
<img src="/img/blog/2020-08-18-statefun/7.png" width="500px" alt="" />
</center>
<div style="line-height:60%;">
<br />
</div>
<p>Stateful Functions provides the building blocks necessary for building complex distributed applications (here the digital twins that support analysis and interactions of the physical entities), while removing common complexities of distributed systems like service discovery, retries, circuit breakers, state management, scalability and similar challenges. If you’d like to learn more about Stateful Functions, head over to the official <a href="https://ci.apache.org/projects/flink/flink-statefun-docs-master/">documentation</a>, where you can also find more hands-on tutorials to try out yourself!</p>
</article>
</div>
<div class="row">
<div id="disqus_thread"></div>
<script>
/*
 * Disqus comment embed.
 * Declares the global `disqus_shortname` and then injects the forum's embed.js
 * as an async <script> element appended to <head> (or <body> when no <head>
 * element is found).
 * The URL is pinned to https:// rather than protocol-relative so the embed is
 * never fetched over plain HTTP and still resolves when the page is opened
 * from file://.
 */
/* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
var disqus_shortname = 'stratosphere-eu'; // required: replace example with your forum shortname
/* * * DON'T EDIT BELOW THIS LINE * * */
(function() {
var dsq = document.createElement('script'); dsq.async = true;
dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</div>
</div>
</div>
</div>
</div>
<hr />
<div class="row">
<div class="footer text-center col-sm-12">
<p>Copyright © 2014-2019 <a href="http://apache.org">The Apache Software Foundation</a>. All Rights Reserved.</p>
<p>Apache Flink, Flink®, Apache®, the squirrel logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation.</p>
<p><a href="/privacy-policy.html">Privacy Policy</a> &middot; <a href="/blog/feed.xml">RSS feed</a></p>
</div>
</div>
</div><!-- /.container -->
<!-- Include all compiled plugins (below), or include individual files as needed -->
<!-- Bootstrap JS is pinned to the same release (3.4.1) as the Bootstrap CSS loaded in <head>,
     so the plugins and the stylesheet stay in sync. It relies on the jQuery loaded in <head>. -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
<!-- jQuery matchHeight plugin, followed by the site's own code-tab and sticky-sidebar scripts. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.matchHeight/0.7.0/jquery.matchHeight-min.js"></script>
<script src="/js/codetabs.js"></script>
<script src="/js/stickysidebar.js"></script>
<!-- Google Analytics -->
<script>
/*
 * Google Analytics (analytics.js) loader.
 * The IIFE stores the name 'ga' in window.GoogleAnalyticsObject, defines
 * window.ga as a stub that pushes its arguments onto ga.q (unless ga already
 * exists), records the current time in ga.l, and inserts an async <script>
 * for analytics.js before the first <script> element in the document.
 * The library URL uses an explicit https:// scheme instead of a
 * protocol-relative one so it is never requested over plain HTTP and does not
 * break when the page is viewed from file://.
 */
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Queue tracker creation for this property, then a pageview hit for the current page.
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
</body>
</html>