blob: 0fcac35785f1f23009dbcbd090c75589a243699c [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>Apache Cassandra | Apache Cassandra Documentation</title>
<link rel="stylesheet" href="../../assets/css/site.css">
<meta name="description" content="The Apache Cassandra Community">
<link rel="schema.dcterms" href="https://purl.org/dc/terms/">
<meta name="dcterms.subject" content="_">
<meta name="dcterms.identifier" content="master">
<meta name="generator" content="Antora 2.3.4">
<link rel="icon" href="../../assets/img/favicon.ico" type="image/x-icon">
<script>
const script = document.createElement("script");
const domain = window.location.hostname;
script.type = "text/javascript";
script.src = "https://plausible.cassandra.apache.org/js/plausible.js";
script.setAttribute("data-domain",domain);
script.setAttribute("defer",'true');
script.setAttribute("async",'true');
document.getElementsByTagName("head")[0].appendChild(script);
</script> </head>
<body class="single-post">
<div class="container mx-auto relative">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<meta property="og:type" content="website" />
<meta property="og:description" content="" />
<meta property="og:url" content="/" />
<meta property="og:site_name" content="Apache Cassandra" />
<header id="top-nav">
<div class="inner relative">
<div class="header-social-icons text-right">
<a href="https://twitter.com/cassandra?lang=en" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
<div class="cf">
<div class="logo left"><a href="/"><img src="../../assets/img/logo-white-r.png" alt="cassandra logo"></a></div>
<div class="mobile-nav-icon right">
<img class="toggle-icon" src="../../assets/img/hamburger-nav.svg">
</div>
<ul class="main-nav nav-links right flex flex-vert-center flex-space-between">
<li>
<a class="nav-link hide-mobile">Get Started</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/cassandra-basics.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra Basics
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/quickstart.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-rocket.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Quickstart
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/ecosystem.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-ecosystem.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Ecosystem
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link" href="/doc/latest/">Documentation</a></li>
<li>
<a class="nav-link" href="/_/community.html">Community</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/community.html#code-of-conduct">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-welcome.png" alt="welcome icon">
</div>
<div class="sub-nav-text teal py-small">
Welcome
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#discussions">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-discussions.png" alt="discussions icon">
</div>
<div class="sub-nav-text teal py-small">
Discussions
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#project-governance">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-governance.png" alt="Governance icon">
</div>
<div class="sub-nav-text teal py-small">
Governance
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#how-to-contribute">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-contribute.png" alt="Contribute icon">
</div>
<div class="sub-nav-text teal py-small">
Contribute
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#meet-the-community">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-community.png" alt="Meet the Community icon">
</div>
<div class="sub-nav-text teal py-small">
Meet the Community
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/cassandra-catalyst-program.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-catalyst.png" alt="Catalyst icon">
</div>
<div class="sub-nav-text teal py-small">
Catalyst Program
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/events.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-events.png" alt="Events icon">
</div>
<div class="sub-nav-text teal py-small">
Events
</div>
</a>
</li>
</ul>
</li>
<li>
<a class="nav-link hide-mobile">Learn</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/Apache-Cassandra-5.0-Moving-Toward-an-AI-Driven-Future.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="Basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra 5.0
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/case-studies.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-case-study.png" alt="Case Studies icon">
</div>
<div class="sub-nav-text teal py-small">
Case Studies
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/resources.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-resources.png" alt="Resources icon">
</div>
<div class="sub-nav-text teal py-small">
Resources
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/blog.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-blog.png" alt="Blog icon">
</div>
<div class="sub-nav-text teal py-small">
Blog
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link btn btn--filled" href="/_/download.html">Download Now</a></li>
</ul>
</div>
</div>
</header>
<div class="hero hero--home grad">
<div class="eye"></div>
<div id="home-content" class="text-center flex flex-center flex-column relative z2 ma-xlarge">
<h1>Inside Cassandra: an interview with Marcel Birkner at Instana</h1>
<h3>November 17, 2021 | The Apache Cassandra Community</h3>
</div>
</div>
<div id="blog-post" class="flex-center py-large arrow">
<div class="blog-breadcrumb mb-medium">
<div class="inner inner--narrow">
<a href="/_/blog.html">« Back to the Apache Cassandra Blog</a>
</div>
</div>
<div class="post-content">
<div class="inner inner--narrow">
<div id="preamble">
<div class="sectionbody">
<div class="paragraph">
<p>Welcome to Inside Cassandra, a new series on the Apache Cassandra blog, where we interview engineers and devs in the community either about their use of Apache Cassandra to power their businesses or how they contribute to the project.</p>
</div>
<div class="paragraph">
<p>In our first interview, Patrick McFadin, Apache Cassandra community member, sat down with Marcel Birkner, Site Reliability Engineer (SRE) at Instana, an IBM Company. They discussed the unique challenge faced by the company as it strives to scale highly useful telemetry with a responsive query system. This article uses extracts from that conversation, which took place before IBM acquired the business.</p>
</div>
<div class="paragraph">
<p>One of the constants of life as an SRE is that <a href="https://www.instana.com/blog/life-of-an-sre-at-instana-things-break-all-the-time-in-distributed-systems-part-1-clickhouse/" target="_blank" rel="noopener">things break all the time</a>. If developers are expanding what the product can do, and operations are making sure everything keeps running smoothly, SREs often fill that annoying gap between theory and practice. They fix things as they break, set up alerts to enable them to know the next time it breaks, and keep a whole system humming despite its flaws.</p>
</div>
<div class="paragraph">
<p>Instana, which IBM acquired in 2020, delivers application performance management software for modern cloud-native environments and is designed to create closed-loop DevOps automation. Instana has developed a high-performance, high-scale architecture which can capture 100% of transactions in one-second intervals across an extensive distributed application in a hybrid cloud. It collects billions of metrics per day in real-time and uses Apache Cassandra to store and process the metric data at scale.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="finding-the-sweet-spot-for-scaling"><a class="anchor" href="#finding-the-sweet-spot-for-scaling"></a>Finding the Sweet Spot for Scaling</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Instana writes for some of its clusters are “250,000 to 1,200,000 metrics per second,” says Marcel, who has been an SRE at Instana for almost four years. By any standard, that’s a huge amount of data ingestion for a distributed workload. But instead of just throwing additional nodes at each problem, Marcel and his team have tried to find the scaling for each node that would perform the best. “The big challenge there is finding the right balance between resources and performance for each node. Finding the right CPU and memory size, because you neither want to have poor performance nor waste money.” That balance is critical for a team running similar demands on multiple clusters. “It took us a while to find the right sweet spot,” says Marcel.</p>
</div>
<div class="imageblock">
<div class="content">
<img src="../_images/blog/inside-Cassandra-Marcel-Birkner/image2.png" alt="sweet spot">
</div>
</div>
<div class="paragraph">
<p>Stats for one Metrics Cassandra cluster: 380k writes/sec, 9k reads/sec, overall great latency.</p>
</div>
<div class="paragraph">
<p>The challenge of knowing when to scale horizontally, when to scale vertically, and when to scale down to save money is something every operations or reliability engineer has faced in the real world. “There’s no single answer for all use cases,” says Marcel, but he sees close monitoring of your system as part of the solution: If you don’t know the effect that resource allocation is having on your service, there’s no way to optimize.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="monitoring-a-monitoring-service"><a class="anchor" href="#monitoring-a-monitoring-service"></a>Monitoring a Monitoring Service</h2>
<div class="sectionbody">
<div class="paragraph">
<p>In many ways, monitoring, or its buzzword cousin observability, is the lifeblood of any SRE’s work. At a company whose product is monitoring, the act of monitoring your own services is ‘dog-fooding’ at its finest.</p>
</div>
<div class="paragraph">
<p>“We are a monitoring company, and we monitor the things that we use ourselves the best,” says Marcel. And really, it makes sense: any time a monitoring company is surprised by resource overruns or over-provisioning that has budgetary impacts, the jokes kind of write themselves.</p>
</div>
<div class="paragraph">
<p>Marcel monitors a stack where most components are running as part of a cluster, and, as such, there is fault-tolerance built-in thanks, in part, to Cassandra: “a single failing node does not impact our customers. Our system is built to handle these kinds of things. Nevertheless, it is critical to resolve these problems so that they do not escalate.”</p>
</div>
<div class="paragraph">
<p>When you begin on your journey to a well-monitored system, the first step is always the monitoring and alerts that come built-in. For Marcel and the team, Cassandra’s built-in tools were part of the picture from the outset. “Since we use Apache Cassandra heavily ourselves, there&#8217;s a lot of built-in alerts,” he told Patrick. A variety of metrics are crucial to understanding service health, such as: “the queuing mutation requests or whether there&#8217;s high CPU load now on nodes, or if there are network problems.” Together these give insight into critical questions like the node scaling mentioned above.</p>
</div>
<div class="imageblock">
<div class="content">
<img src="../_images/blog/inside-Cassandra-Marcel-Birkner/image3.png" alt="single metrics node 1">
</div>
</div>
<div class="imageblock">
<div class="content">
<img src="../_images/blog/inside-Cassandra-Marcel-Birkner/image5.png" alt="single metrics node 2">
</div>
</div>
<div class="imageblock">
<div class="content">
<img src="../_images/blog/inside-Cassandra-Marcel-Birkner/image4.png" alt="single metrics node 3">
</div>
</div>
<div class="paragraph">
<p>Detailed statistics for a single metrics Cassandra node.</p>
</div>
<div class="paragraph">
<p>When scaling, experimentation is vital. Monitoring is a crucial component here since operators need to answer whether modifying provisioning helped at all. “We’re able to monitor very closely what’s happening on the host while we tweak everything, and that lets us compare before and after the tweaks,” says Marcel.</p>
</div>
<div class="imageblock">
<div class="content">
<img src="../_images/blog/inside-Cassandra-Marcel-Birkner/image1.png" alt="1 second metrics resolution">
</div>
</div>
<div class="paragraph">
<p>1-second metrics resolution helps us identify even small spikes in metrics when making infrastructure changes.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="using-monitoring-needs-to-grow-the-product"><a class="anchor" href="#using-monitoring-needs-to-grow-the-product"></a>Using Monitoring needs to Grow the Product</h2>
<div class="sectionbody">
<div class="paragraph">
<p>The most important benefit of ‘dog-fooding,’ or using your own product the way your customers do, is the way it highlights how the service needs to expand. For Marcel, that meant finding additional monitoring needs. “So the thing is, we monitor all aspects like network disk I/O on the host JVM [&#8230;&#8203;] if we are missing metrics, we just write the sensor for it, to get additional metrics.”</p>
</div>
<div class="paragraph">
<p>“If we have problems as an SRE team, and we don&#8217;t have the insights, we just tell our engineering team,” says Marcel. “Then they just enhance the product, either by new data routes or by new data that they&#8217;re collecting from the different technologies we monitor. And that makes our lives a lot easier.”</p>
</div>
<div class="paragraph">
<p>Head here for more information on <a href="/doc/latest/cassandra/operating/metrics.html">monitoring and metrics within Apache Cassandra</a>, and for a list of third-party projects, tools, and products, head to the project’s <a href="/_/ecosystem.html">Ecosystem page</a>.</p>
</div>
</div>
</div>
</div>
</div>
</div>
<footer class="grad grad--two flex-center pb-xlarge">
<div class="inner text-center z2 relative">
<h2 class="white py-small">Get started with Cassandra, fast.</h2>
<a id="footer-cta" href="/_/quickstart.html" class="btn btn--filled ma-medium">Quickstart Guide</a>
</div>
<div class="inner flex flex-distribute-items mt-xlarge z2 relative">
<div class="col-2">
<div id="footer-logo" class="logo logo--footer mb-medium"><img src="../../assets/img/logo-white-r.png" alt="Cassandra Logo"></div>
<p>Apache Cassandra<img src="../../assets/img/registered.svg" alt="®" style="width:18px;"> powers mission-critical deployments with improved performance and unparalleled levels of scale in the cloud.</p>
<div class="footer-social-icons">
<a href="https://twitter.com/cassandra?lang=en" target="_blank"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
</div>
<div class="col-2 flex flex-center">
<ul class="columns-2">
<li class="mb-small"><a href="/">Home</a></li>
<li class="mb-small"><a href="/_/cassandra-basics.html">Cassandra Basics</a></li>
<li class="mb-small"><a href="/_/quickstart.html">Quickstart</a></li>
<li class="mb-small"><a href="/_/ecosystem.html">Ecosystem</a></li>
<li class="mb-small"><a href="/doc/latest/">Documentation</a></li>
<li class="mb-small"><a href="/_/community.html">Community</a></li>
<li class="mb-small"><a href="/_/case-studies.html">Case Studies</a></li>
<li class="mb-small"><a href="/_/resources.html">Resources</a></li>
<li class="mb-small"><a href="/_/blog.html">Blog</a></li>
</ul>
</div>
</div>
</footer>
<div class="lower-footer bg-white pa-medium">
<div class="flex flex-row flex-vert-center">
<div class="pr-medium"><img src="../../assets/img//feather-small.png" alt="ASF" width="20"></div>
<div class="pr-medium"><a href="http://www.apache.org/" target="_blank">Foundation</a></div>
<div class="pr-medium"><a href="https://www.apache.org/events/current-event.html" target="_blank">Events</a></div>
<div class="pr-medium"><a href="https://www.apache.org/licenses/" target="_blank">License</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/thanks" target="_blank">Thanks</a></div>
<div class="pr-medium"><a href="https://www.apache.org/security" target="_blank">Security</a></div>
<div class="pr-medium"><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank">Privacy</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/sponsorship" target="_blank">Sponsorship</a></div>
</div>
<p class="my-medium">© 2009-<script>document.write(new Date().getFullYear())</script> <a href="https://apache.org" target="_blank">The Apache Software Foundation</a> under the terms of the Apache License 2.0. Apache, the Apache feather logo, Apache Cassandra, Cassandra, and the Cassandra logo, are either registered trademarks or trademarks of The Apache Software Foundation.</p>
</div>
<div id="fade" class="hidden"></div>
<div id="modal" class="hidden">
<div id="close-modal" class="cursor-pointer"><svg viewBox="0 0 24 24" width="24" height="24" stroke="currentColor" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round" class="css-i6dzq1"><line x1="18" y1="6" x2="6" y2="18"></line><line x1="6" y1="6" x2="18" y2="18"></line></svg></div>
<div id="mod-content" class="vid-mod-content resp-container"></div>
</div>
<script>
jQuery(function(){
var windowW = $(window).width();
$(document)
.on('click','.mobile-nav-icon',function(){
$('.main-nav').fadeIn();
})
.on('click','.main-nav',function(){
if(windowW <= 1000){
$(this).fadeOut();
}
})
.on('click','#version-toggle',function(){
$(this).toggleClass('active');
$(this).next().fadeToggle();
})
.on('click','#mobile-docs-nav-burger', function(){
$(this).toggleClass('active');
$('.docs-nav').toggleClass('active');
});
var url = window.location.pathname;
var isQuickstart = url.includes('quickstart.html');
if(isQuickstart){
var footerCTA = document.getElementById('footer-cta');
footerCTA.innerHTML = 'Get latest updates';
footerCTA.setAttribute('href', '/_/blog.html');
}
});
</script>
</div>
</body>
<script>
jQuery(function(){
});
</script>
</html>