blob: 0652724661225bc074880c2cacf47fc69c663f2b [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink: Flink and Prometheus: Cloud-native monitoring of streaming applications</title>
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">
<link rel="stylesheet" href="/css/flink.css">
<link rel="stylesheet" href="/css/syntax.css">
<!-- Blog RSS feed -->
<link href="/blog/feed.xml" rel="alternate" type="application/rss+xml" title="Apache Flink Blog: RSS feed" />
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<!-- We need to load Jquery in the header for custom google analytics event tracking-->
<script src="/js/jquery.min.js"></script>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Main content. -->
<div class="container">
<div class="row">
<div id="sidebar" class="col-sm-3">
<!-- Top navbar. -->
<nav class="navbar navbar-default">
<!-- The logo. -->
<div class="navbar-header">
<!-- Toggle for the collapsed navbar. The button shows only icon bars, so the
     sr-only span supplies its accessible name; aria-expanded/aria-controls
     tie it to the collapsible region it opens. -->
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1" aria-expanded="false" aria-controls="bs-example-navbar-collapse-1">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="/">
<img alt="Apache Flink" src="/img/flink-header-logo.svg" width="147px" height="73px">
</a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav navbar-main">
<!-- First menu section explains to visitors what Flink is -->
<!-- What is Stream Processing? -->
<!--
<li><a href="/streamprocessing1.html">What is Stream Processing?</a></li>
-->
<!-- What is Flink? -->
<li><a href="/flink-architecture.html">What is Apache Flink?</a></li>
<!-- What is Stateful Functions? -->
<li><a href="/stateful-functions.html">What is Stateful Functions?</a></li>
<!-- Use cases -->
<li><a href="/usecases.html">Use Cases</a></li>
<!-- Powered by -->
<li><a href="/poweredby.html">Powered By</a></li>
&nbsp;
<!-- Second menu section aims to support Flink users -->
<!-- Downloads -->
<li><a href="/downloads.html">Downloads</a></li>
<!-- Getting Started -->
<li class="dropdown">
<!-- href="#" does not navigate: expose the trigger as a button with popup/expanded state for assistive technology. -->
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button" aria-haspopup="true" aria-expanded="false">Getting Started<span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="https://ci.apache.org/projects/flink/flink-docs-release-1.11/getting-started/index.html" target="_blank">With Flink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-release-2.1/getting-started/project-setup.html" target="_blank">With Flink Stateful Functions <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="/training.html">Training Course</a></li>
</ul>
</li>
<!-- Documentation -->
<li class="dropdown">
<!-- href="#" does not navigate: expose the trigger as a button with popup/expanded state for assistive technology. -->
<a class="dropdown-toggle" data-toggle="dropdown" href="#" role="button" aria-haspopup="true" aria-expanded="false">Documentation<span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="https://ci.apache.org/projects/flink/flink-docs-release-1.11" target="_blank">Flink 1.11 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-docs-master" target="_blank">Flink Master (Latest Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-release-2.1" target="_blank">Flink Stateful Functions 2.1 (Latest stable release) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li><a href="https://ci.apache.org/projects/flink/flink-statefun-docs-master" target="_blank">Flink Stateful Functions Master (Latest Snapshot) <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
</ul>
</li>
<!-- getting help -->
<li><a href="/gettinghelp.html">Getting Help</a></li>
<!-- Blog -->
<li class="active"><a href="/blog/"><b>Flink Blog</b></a></li>
<!-- Flink-packages -->
<li>
<a href="https://flink-packages.org" target="_blank">flink-packages.org <small><span class="glyphicon glyphicon-new-window"></span></small></a>
</li>
&nbsp;
<!-- Third menu section aims to support community and contributors -->
<!-- Community -->
<li><a href="/community.html">Community &amp; Project Info</a></li>
<!-- Roadmap -->
<li><a href="/roadmap.html">Roadmap</a></li>
<!-- Contribute -->
<li><a href="/contributing/how-to-contribute.html">How to Contribute</a></li>
<!-- GitHub -->
<li>
<a href="https://github.com/apache/flink" target="_blank">Flink on GitHub <small><span class="glyphicon glyphicon-new-window"></span></small></a>
</li>
&nbsp;
<!-- Language Switcher -->
<li>
<!-- link to the Chinese home page when current is blog page -->
<a href="/zh">中文版</a>
</li>
</ul>
<ul class="nav navbar-nav navbar-bottom">
<hr />
<!-- Twitter -->
<li><a href="https://twitter.com/apacheflink" target="_blank">@ApacheFlink <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<!-- Visualizer -->
<li class=" hidden-md hidden-sm"><a href="/visualizer/" target="_blank">Plan Visualizer <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<hr />
<li><a href="https://apache.org" target="_blank">Apache Software Foundation <small><span class="glyphicon glyphicon-new-window"></span></small></a></li>
<li>
<style>
/* Inline layout for the License / Security / Donate / Thanks links that follow. */
.smalllinks:link {
  display: inline-block !important;
  min-width: 75px;
  padding-top: 0px;
  padding-right: 0px;
  padding-bottom: 0px;
  background: none;
}
</style>
<a class="smalllinks" href="https://www.apache.org/licenses/" target="_blank">License</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/security/" target="_blank">Security</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/foundation/sponsorship.html" target="_blank">Donate</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
<a class="smalllinks" href="https://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a> <small><span class="glyphicon glyphicon-new-window"></span></small>
</li>
</ul>
</div><!-- /.navbar-collapse -->
</nav>
</div>
<div class="col-sm-9">
<div class="row-fluid">
<div class="col-sm-12">
<div class="row">
<h1>Flink and Prometheus: Cloud-native monitoring of streaming applications</h1>
<p><i></i></p>
<article>
<p>11 Mar 2019 Maximilian Bode, TNG Technology Consulting (<a href="https://twitter.com/mxpbode">@mxpbode</a>)</p>
<p>This blog post describes how developers can leverage Apache Flink’s built-in <a href="https://ci.apache.org/projects/flink/flink-docs-release-1.7/monitoring/metrics.html">metrics system</a> together with <a href="https://prometheus.io/">Prometheus</a> to observe and monitor streaming applications in an effective way. This is a follow-up post from my <a href="https://flink-forward.org/">Flink Forward</a> Berlin 2018 talk (<a href="https://www.slideshare.net/MaximilianBode1/monitoring-flink-with-prometheus">slides</a>, <a href="https://www.ververica.com/flink-forward-berlin/resources/monitoring-flink-with-prometheus">video</a>). We will cover some basic Prometheus concepts and why it is a great fit for monitoring Apache Flink stream processing jobs. There is also an example to showcase how you can utilize Prometheus with Flink to gain insights into your applications and be alerted on potential degradations of your Flink jobs.</p>
<h2 id="why-prometheus">Why Prometheus?</h2>
<p>Prometheus is a metrics-based monitoring system that was originally created in 2012. The system is completely open source (under the Apache License 2.0) with a vibrant community behind it, and it graduated from the Cloud Native Computing Foundation last year – a sign of maturity, stability and production-readiness. As mentioned, the system is based on metrics and is designed to measure the overall health, behavior and performance of a service. Prometheus features a multi-dimensional data model as well as a flexible query language. It is designed for reliability and can easily be deployed in traditional or containerized environments. Some of the important Prometheus concepts are:</p>
<ul>
<li>
<p><strong>Metrics:</strong> Prometheus defines metrics as floating-point values that change over time. These time series have millisecond precision.</p>
</li>
<li>
<p><strong>Labels</strong> are the key-value pairs associated with time series that support Prometheus’ flexible and powerful data model – in contrast to hierarchical data structures that one might experience with traditional metrics systems.</p>
</li>
<li>
<p><strong>Scrape:</strong> Prometheus is a pull-based system and fetches (“scrapes”) metrics data from specified sources that expose HTTP endpoints with a text-based format.</p>
</li>
<li>
<p><strong>PromQL</strong> is Prometheus’ <a href="https://prometheus.io/docs/prometheus/latest/querying/basics/">query language</a>. It can be used for both building dashboards and setting up alert rules that will trigger when specific conditions are met.</p>
</li>
</ul>
<p>When considering metrics and monitoring systems for your Flink jobs, there are many <a href="https://ci.apache.org/projects/flink/flink-docs-release-1.7/monitoring/metrics.html">options</a>. Flink offers native support for exposing data to Prometheus via the <code>PrometheusReporter</code> configuration. Setting up this integration is very easy.</p>
<p>Prometheus is a great choice as usually Flink jobs are not running in isolation but in a greater context of microservices. For making metrics available to Prometheus from other parts of a larger system, there are two options: There exist <a href="https://prometheus.io/docs/instrumenting/clientlibs/">libraries for all major languages</a> to instrument other applications. Additionally, there is a wide variety of <a href="https://prometheus.io/docs/instrumenting/exporters/">exporters</a>, which are tools that expose metrics of third-party systems (like databases or Apache Kafka) as Prometheus metrics.</p>
<h2 id="prometheus-and-flink-in-action">Prometheus and Flink in Action</h2>
<p>We have provided a <a href="https://github.com/mbode/flink-prometheus-example">GitHub repository</a> that demonstrates the integration described above. To have a look, clone the repository, make sure <a href="https://docs.docker.com/install/">Docker</a> is installed and run:</p>
<div class="highlight"><pre><code>./gradlew composeUp
</code></pre></div>
<p>This builds a Flink job using the build tool <a href="https://gradle.org/">Gradle</a> and starts up a local environment based on <a href="https://docs.docker.com/compose/">Docker Compose</a> running the job in a <a href="https://ci.apache.org/projects/flink/flink-docs-release-1.7/ops/deployment/docker.html#flink-job-cluster">Flink job cluster</a> (reachable at <a href="http://localhost:8081/">http://localhost:8081</a>) as well as a Prometheus instance (<a href="http://localhost:9090/">http://localhost:9090</a>).</p>
<center>
<img src="/img/blog/2019-03-11-prometheus-monitoring/prometheusexamplejob.png" width="600px" alt="PrometheusExampleJob in Flink Web UI" />
<br />
<i><small>Job graph and custom metric for example job in Flink web interface.</small></i>
</center>
<p><br /></p>
<p>The <code>PrometheusExampleJob</code> has three operators: Random numbers up to 10,000 are generated, then a map counts the events and creates a histogram of the values passed through. Finally, the events are discarded without further output. The very simple code below is from the second operator. It illustrates how easy it is to add custom metrics relevant to your business logic into your Flink job.</p>
<div class="highlight"><pre><code class="language-java"><span class="kd">class</span> <span class="nc">FlinkMetricsExposingMapFunction</span> <span class="kd">extends</span> <span class="n">RichMapFunction</span><span class="o">&lt;</span><span class="n">Integer</span><span class="o">,</span> <span class="n">Integer</span><span class="o">&gt;</span> <span class="o">{</span>
<span class="kd">private</span> <span class="kd">transient</span> <span class="n">Counter</span> <span class="n">eventCounter</span><span class="o">;</span>
<span class="nd">@Override</span>
<span class="kd">public</span> <span class="kt">void</span> <span class="nf">open</span><span class="o">(</span><span class="n">Configuration</span> <span class="n">parameters</span><span class="o">)</span> <span class="o">{</span>
<span class="n">eventCounter</span> <span class="o">=</span> <span class="n">getRuntimeContext</span><span class="o">().</span><span class="na">getMetricGroup</span><span class="o">().</span><span class="na">counter</span><span class="o">(</span><span class="s">&quot;events&quot;</span><span class="o">);</span>
<span class="o">}</span>
<span class="nd">@Override</span>
<span class="kd">public</span> <span class="n">Integer</span> <span class="nf">map</span><span class="o">(</span><span class="n">Integer</span> <span class="n">value</span><span class="o">)</span> <span class="o">{</span>
<span class="n">eventCounter</span><span class="o">.</span><span class="na">inc</span><span class="o">();</span>
<span class="k">return</span> <span class="n">value</span><span class="o">;</span>
<span class="o">}</span>
<span class="o">}</span></code></pre></div>
<center><i><small>Excerpt from <a href="https://github.com/mbode/flink-prometheus-example/blob/master/src/main/java/com/github/mbode/flink_prometheus_example/FlinkMetricsExposingMapFunction.java">FlinkMetricsExposingMapFunction.java</a> demonstrating custom Flink metric.</small></i></center>
<h2 id="configuring-prometheus-with-flink">Configuring Prometheus with Flink</h2>
<p>To start monitoring Flink with Prometheus, the following steps are necessary:</p>
<ol>
<li>
<p>Make the <code>PrometheusReporter</code> jar available to the classpath of the Flink cluster (it comes with the Flink distribution):</p>
<div class="highlight"><pre><code> cp /opt/flink/opt/flink-metrics-prometheus-1.7.2.jar /opt/flink/lib
</code></pre></div>
</li>
<li>
<p><a href="https://ci.apache.org/projects/flink/flink-docs-release-1.7/monitoring/metrics.html#reporter">Configure the reporter</a> in Flink’s <em>flink-conf.yaml</em>. All job managers and task managers will expose the metrics on the configured port.</p>
<div class="highlight"><pre><code> metrics.reporters: prom
metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
metrics.reporter.prom.port: 9999
</code></pre></div>
</li>
<li>
<p>Prometheus needs to know where to scrape metrics. In a static scenario, you can simply <a href="https://prometheus.io/docs/prometheus/latest/configuration/configuration/">configure Prometheus</a> in <em>prometheus.yml</em> with the following:</p>
<div class="highlight"><pre><code> scrape_configs:
- job_name: 'flink'
static_configs:
- targets: ['job-cluster:9999', 'taskmanager1:9999', 'taskmanager2:9999']
</code></pre></div>
<p>In more dynamic scenarios we recommend using Prometheus’ service discovery support for different platforms such as Kubernetes, AWS EC2 and more.</p>
</li>
</ol>
<p>Both custom metrics are now available in Prometheus:</p>
<center>
<img src="/img/blog/2019-03-11-prometheus-monitoring/prometheus.png" width="600px" alt="Prometheus web UI with example metric" />
<br />
<i><small>Example metric in Prometheus web UI.</small></i>
</center>
<p><br /></p>
<p>More technical metrics from the Flink cluster (like checkpoint sizes or duration, Kafka offsets or resource consumption) are also available. If you are interested, you can check out the HTTP endpoints exposing all Prometheus metrics for the job manager and the two task managers on <a href="http://localhost:9249/metrics">http://localhost:9249</a>, <a href="http://localhost:9250/metrics">http://localhost:9250</a> and <a href="http://localhost:9251/metrics">http://localhost:9251</a>, respectively.</p>
<p>To test Prometheus’ alerting feature, kill one of the Flink task managers via</p>
<div class="highlight"><pre><code>docker kill taskmanager1
</code></pre></div>
<p>Our Flink job can recover from this partial failure via the mechanism of <a href="https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/stream/state/checkpointing.html">Checkpointing</a>. Nevertheless, after roughly one minute (as configured in the alert rule) the following alert will fire:</p>
<center>
<img src="/img/blog/2019-03-11-prometheus-monitoring/prometheusalerts.png" width="600px" alt="Prometheus web UI with example alert" />
<br />
<i><small>Example alert in Prometheus web UI.</small></i>
</center>
<p><br /></p>
<p>In real-world situations alerts like this one can be routed through a component called <a href="https://prometheus.io/docs/alerting/alertmanager/">Alertmanager</a> and be grouped into notifications to systems like email, PagerDuty or Slack.</p>
<p>Go ahead and play around with the setup, and check out the <a href="https://grafana.com/grafana">Grafana</a> instance reachable at <a href="http://localhost:3000/">http://localhost:3000</a> (credentials <em>admin:flink</em>) for visualizing Prometheus metrics. If there are any questions or problems, feel free to <a href="https://github.com/mbode/flink-prometheus-example/issues">create an issue</a>. Once finished, do not forget to tear down the setup via</p>
<div class="highlight"><pre><code>./gradlew composeDown
</code></pre></div>
<p><br /></p>
<h2 id="conclusion">Conclusion</h2>
<p>Using Prometheus together with Flink provides an easy way for effective monitoring and alerting of your Flink jobs. Both projects have exciting and vibrant communities behind them with new developments and additions scheduled for upcoming releases. We encourage you to try the two technologies together as it has immensely improved our insights into Flink jobs running in production.</p>
</article>
</div>
<div class="row">
<div id="disqus_thread"></div>
<script type="text/javascript">
/* Disqus comment loader: creates an async <script> element pointing at the
   forum's embed.js and appends it to <head> (or <body> if no head exists). */
/* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
var disqus_shortname = 'stratosphere-eu'; // required: replace example with your forum shortname
/* * * DON'T EDIT BELOW THIS LINE * * */
(function() {
  var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
  // Explicit https: a protocol-relative URL would fetch this third-party
  // script over plain HTTP whenever the page itself is viewed over HTTP.
  dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
  (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</div>
</div>
</div>
</div>
</div>
<hr />
<div class="row">
<div class="footer text-center col-sm-12">
<p>Copyright © 2014-2019 <a href="http://apache.org">The Apache Software Foundation</a>. All Rights Reserved.</p>
<p>Apache Flink, Flink®, Apache®, the squirrel logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation.</p>
<p><a href="/privacy-policy.html">Privacy Policy</a> &middot; <a href="/blog/feed.xml">RSS feed</a></p>
</div>
</div>
</div><!-- /.container -->
<!-- Include all compiled plugins (below), or include individual files as needed -->
<!-- Bootstrap JS is pinned to the same 3.4.1 release as the Bootstrap stylesheet loaded in the head. -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.matchHeight/0.7.0/jquery.matchHeight-min.js"></script>
<script src="/js/codetabs.js"></script>
<script src="/js/stickysidebar.js"></script>
<!-- Google Analytics -->
<script>
// analytics.js loader. The IIFE records the global name ('ga') on window,
// installs a stub ga() that queues its arguments in ga.q, stamps ga.l with the
// current time, then inserts an async <script> for the library before the
// first <script> element in the document.
// The library URL is explicit https rather than protocol-relative so it is
// never requested over plain HTTP.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Queued commands: create the tracker for this property, then record a pageview.
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
</body>
</html>