blob: 81fb946bf83ae5e858dafe7bf1d15f3628083387 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>Apache Cassandra | Apache Cassandra Documentation</title>
<link rel="stylesheet" href="../../assets/css/site.css">
<meta name="description" content="Testing Apache Cassandra">
<link rel="schema.dcterms" href="https://purl.org/dc/terms/">
<meta name="dcterms.subject" content="_">
<meta name="dcterms.identifier" content="master">
<meta name="generator" content="Antora 2.3.4">
<link rel="icon" href="../../assets/img/favicon.ico" type="image/x-icon">
<script>
// Injects the externally hosted plausible.js script into <head>, tagging it
// with the hostname this page is served from via the data-domain attribute.
//
// Wrapped in an IIFE so the `script` local does not become a global lexical
// binding: top-level `const` in a classic script is shared with every other
// classic script on the page, and a later redeclaration would throw.
(function () {
  const script = document.createElement("script");
  script.src = "https://plausible.cassandra.apache.org/js/plausible.js";
  // The hostname is read at runtime, so each host the page is served from
  // gets its own data-domain value.
  script.setAttribute("data-domain", window.location.hostname);
  // Scripts inserted through the DOM load asynchronously; `defer` only applies
  // to parser-inserted scripts, so it is omitted. `async` is kept explicit.
  script.async = true;
  document.head.appendChild(script);
})();
</script> </head>
<body class="single-post">
<div class="container mx-auto relative">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<meta property="og:type" content="website" />
<meta property="og:description" content="" />
<meta property="og:url" content="/" />
<meta property="og:site_name" content="Apache Cassandra" />
<header id="top-nav">
<div class="inner relative">
<div class="header-social-icons text-right">
<a href="https://twitter.com/cassandra?lang=en" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
<div class="cf">
<div class="logo left"><a href="/"><img src="../../assets/img/logo-white-r.png" alt="cassandra logo"></a></div>
<div class="mobile-nav-icon right">
<img class="toggle-icon" src="../../assets/img/hamburger-nav.svg" alt="Open navigation menu">
</div>
<ul class="main-nav nav-links right flex flex-vert-center flex-space-between">
<li>
<a class="nav-link hide-mobile">Get Started</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/cassandra-basics.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra Basics
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/quickstart.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-rocket.png" alt="quickstart icon">
</div>
<div class="sub-nav-text teal py-small">
Quickstart
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/ecosystem.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-ecosystem.png" alt="ecosystem icon">
</div>
<div class="sub-nav-text teal py-small">
Ecosystem
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link" href="/doc/latest/">Documentation</a></li>
<li>
<a class="nav-link" href="/_/community.html">Community</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/community.html#code-of-conduct">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-welcome.png" alt="welcome icon">
</div>
<div class="sub-nav-text teal py-small">
Welcome
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#discussions">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-discussions.png" alt="discussions icon">
</div>
<div class="sub-nav-text teal py-small">
Discussions
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#project-governance">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-governance.png" alt="Governance icon">
</div>
<div class="sub-nav-text teal py-small">
Governance
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#how-to-contribute">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-contribute.png" alt="Contribute icon">
</div>
<div class="sub-nav-text teal py-small">
Contribute
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#meet-the-community">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-community.png" alt="Meet the Community icon">
</div>
<div class="sub-nav-text teal py-small">
Meet the Community
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/cassandra-catalyst-program.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-catalyst.png" alt="Catalyst icon">
</div>
<div class="sub-nav-text teal py-small">
Catalyst Program
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/events.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-events.png" alt="Events icon">
</div>
<div class="sub-nav-text teal py-small">
Events
</div>
</a>
</li>
</ul>
</li>
<li>
<a class="nav-link hide-mobile">Learn</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/Apache-Cassandra-5.0-Moving-Toward-an-AI-Driven-Future.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="Basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra 5.0
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/case-studies.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-case-study.png" alt="Case Studies icon">
</div>
<div class="sub-nav-text teal py-small">
Case Studies
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/resources.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-resources.png" alt="Resources icon">
</div>
<div class="sub-nav-text teal py-small">
Resources
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/blog.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-blog.png" alt="Blog icon">
</div>
<div class="sub-nav-text teal py-small">
Blog
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link btn btn--filled" href="/_/download.html">Download Now</a></li>
</ul>
</div>
</div>
</header>
<div class="hero hero--home grad">
<div class="eye"></div>
<div id="home-content" class="text-center flex flex-center flex-column relative z2 ma-xlarge">
<h1>The Path to Green CI</h1>
<h3>May 19, 2022 | Josh McKenzie</h3>
</div>
</div>
<div id="blog-post" class="flex-center py-large arrow">
<div class="blog-breadcrumb mb-medium">
<div class="inner inner--narrow">
<a href="/_/blog.html">« Back to the Apache Cassandra Blog</a>
</div>
</div>
<div class="post-content">
<div class="inner inner--narrow">
<div id="preamble">
<div class="sectionbody">
<div class="imageblock">
<div class="content">
<img src="../_images/blog/the-path-to-green-ci-unsplash-hasan-almasi.jpg" alt="the path to Green CI">
</div>
<div class="title">Image credit: <a href="https://unsplash.com/@hasanalmasi" target="_blank" rel="noopener">Hasan Almasi on Unsplash</a></div>
</div>
<div class="paragraph">
<p>As we approach the Cassandra 4.1 GA release, it’s a great time to stop and reflect on some of our development history, the past year since we released 4.0, and where we’re headed in the future. In this blog post, we’re going to limit our focus to testing Cassandra as that’s been a huge focus in the 4.0+ time frame.</p>
</div>
</div>
</div>
<div class="sect2">
<h3 id="the-numbers"><a class="anchor" href="#the-numbers"></a>The Numbers</h3>
<div class="paragraph">
<p>But don’t just take my word for it - let’s start with some numbers! We’re going to use “lines of code” (LoC) as a loose proxy for “where we’re spending our time”. This is definitely a fraught metric, but as <em>one</em> way of viewing things it paints a pretty interesting picture, consistent with many of our intuitions.</p>
</div>
<div class="paragraph">
<p>Below is a list for our past three major releases and how much raw code exists in the following:</p>
</div>
<div class="olist arabic">
<ol class="arabic">
<li>
<p><code>src/java</code>: raw database code</p>
</li>
<li>
<p><code>test/unit</code>: unit testing code</p>
</li>
<li>
<p><code>test/distributed</code>: distributed tests in java code</p>
</li>
<li>
<p>The <code>cassandra-dtest</code> repo .py test files: python distributed tests using <a href="https://github.com/riptano/ccm" target="_blank" rel="noopener">ccm</a></p>
</li>
</ol>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 33.3333%;">
<col style="width: 33.3333%;">
<col style="width: 33.3334%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">&nbsp;</th>
<th class="tableblock halign-left valign-top">Lines &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;</th>
<th class="tableblock halign-left valign-top">% total</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">src</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">181,871</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">57.51%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">junit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">100,284</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">31.71%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">jdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">9,225</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">2.92%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">pdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">24,882</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">7.87%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>3.0 Total</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>316,262</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">100.00%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">All tests</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">134,391</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">42.49%</p></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p> <br></p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 25%;">
<col style="width: 25%;">
<col style="width: 25%;">
<col style="width: 25%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">&nbsp;</th>
<th class="tableblock halign-left valign-top">Lines</th>
<th class="tableblock halign-left valign-top">% total</th>
<th class="tableblock halign-left valign-top">% change since 3.0</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">src</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">261,825</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">52.83%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">143.96%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">junit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">172,672</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">34.84%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">172.18%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">jdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">21,769</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">4.39%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">235.98%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">pdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">39,300</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">7.93%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">157.95%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>4.0 Total</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>495,566</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">100.00%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">156.69%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">All tests</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">233,741</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">47.17%</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p> <br></p>
</div>
<table class="tableblock frame-all grid-all stretch">
<colgroup>
<col style="width: 25%;">
<col style="width: 25%;">
<col style="width: 25%;">
<col style="width: 25%;">
</colgroup>
<thead>
<tr>
<th class="tableblock halign-left valign-top">&nbsp;</th>
<th class="tableblock halign-left valign-top">Lines</th>
<th class="tableblock halign-left valign-top">% total</th>
<th class="tableblock halign-left valign-top">% change since 4.0</th>
</tr>
</thead>
<tbody>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">src</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">297,685</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">52.58%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">113.70%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">junit</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">197,231</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">34.84%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">114.22%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">jdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">31,306</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">5.53%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">143.81%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">pdtest</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">39,905</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">7.05%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">101.54%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>4.1 Total</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock"><strong>566,127</strong></p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">100.00%</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">114.24%</p></td>
</tr>
<tr>
<td class="tableblock halign-left valign-top"><p class="tableblock">All tests</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">268,442</p></td>
<td class="tableblock halign-left valign-top"><p class="tableblock">47.42%</p></td>
<td class="tableblock halign-left valign-top"></td>
</tr>
</tbody>
</table>
<div class="paragraph">
<p>The biggest thing that immediately jumps out to me: our in-jvm dtests have been growing at a very strong pace relative to the rest of the code-base. Immense effort has gone into not just authoring this testing <em>framework</em>, but also adding new tests to it. There was also a significant jump from 3.0 to 4.0 in test code as a share of the entire codebase: an increase of almost 5 percentage points (42.49% to 47.17%).</p>
</div>
<div class="paragraph">
<p>We can also infer that our new code addition to the database has <em>accelerated</em> in the past year, as the delta from 4.0-4.1 represents a time frame of one calendar year and roughly 38k LoC net add vs. the 5.5 year gap between 3.0 and 4.0 with a net add of ~80k LoC. While the database also had a release line of 3.1-3.11 introducing new features and tests during this time window, for the sake of this analysis we’re only considering major traditional point releases.</p>
</div>
<div class="paragraph">
<p>So what does all this mean for us working on and depending on the project? It means that with the release of 4.1, <em>we have 10% more code just <strong>unit</strong> testing the database than we had in the entire database in 3.0</em>. It means that new development on Cassandra is accelerating. It also means we’re constantly moving the goalposts on what’s required to keep <a href="https://ci-cassandra.apache.org/" target="_blank" rel="noopener">Green (passing) CI (continuous integration)</a>.</p>
</div>
</div>
<div class="sect2">
<h3 id="keeping-it-green"><a class="anchor" href="#keeping-it-green"></a>Keeping it Green</h3>
<div class="paragraph">
<p>We all know Cassandra is an incredibly complex piece of software. The power to scale up linearly to petabytes of data on hundreds of machines, with zero downtime, in a masterless single logical cluster, where machines can drop out and in, <a href="/doc/latest/cassandra/operating/hints.html">hint</a>, <a href="/doc/latest/cassandra/operating/read_repair.html">heal</a>, and <a href="/doc/latest/cassandra/operating/repair.html">repair</a> simply cannot be implemented without a significant amount of code, infrastructure, and testing to ensure it works as expected. Further, as we support hot upgrades with zero downtime between versions with mixed version clusters running, we have a strong commitment to backwards compatibility in the mix.</p>
</div>
<div class="paragraph">
<p>One of our struggles over time has been the software and hardware complexity required to keep our testing infrastructure “clean”, or green, on an ongoing basis. Balancing runtime, resourcing, and cost with a system as complex as Cassandra is a fixed challenge to begin with and is only growing over time as we’ve seen above.</p>
</div>
<div class="paragraph">
<p>We always drive down to a stable 0 test failures at a GA release; however, what we refer to as “flaky tests” sneak back into our suite over time. Let’s take a recent example, <a href="https://nightlies.apache.org/cassandra/ci-cassandra.apache.org/job/Cassandra-trunk/1112/" target="_blank" rel="noopener">build run 1112</a> on trunk (effectively Cassandra 4.1 pre-alpha).</p>
</div>
<div class="paragraph">
<p>19 test failures! <a href="https://nightlies.apache.org/cassandra/ci-cassandra.apache.org/job/Cassandra-trunk/1112/testReport/" target="_blank" rel="noopener">Out of an entire suite of 49,704 tests, that makes for a 99.96% pass rate</a>. Nobody wants a database that works 99.96% of the time, however, and that’s assuming we have 100% test coverage of not just all our code but also all possible combinations of state, a problem so daunting some contributors are <a href="https://issues.apache.org/jira/browse/CASSANDRA-15348" target="_blank" rel="noopener">pushing the bleeding edge of the state of the art of distributed database testing</a>.</p>
</div>
<div class="paragraph">
<p>Burning down fewer than 20 flaky tests to get our release out between freeze and our goal for release, an eight-week window, is quite doable, so why not just continue to float along with a low number of test failures? Well, it gets more complicated when we look at <em>where</em> we run our tests.</p>
</div>
</div>
<div class="sect2">
<h3 id="circle-vs-jenkins"><a class="anchor" href="#circle-vs-jenkins"></a>Circle vs. Jenkins</h3>
<div class="paragraph">
<p>As <a href="../development/testing.html" class="page">we outline in our contributor guide on testing</a>, tests can be run either on <a href="https://ci-cassandra.apache.org/" target="_blank" rel="noopener">Apache Jenkins infrastructure</a> or on <a href="https://github.com/apache/cassandra/tree/cassandra-4.1/.circleci" target="_blank" rel="noopener">CircleCI</a>. The primary differences between these two systems are runtime, cost, and resources allocated to each individual test. While some contributors have access to paid CircleCI accounts that allow them to dedicate more resources to their test runs and shorten feedback loops, this is an open source volunteer project and our canonical CI is the Apache Jenkins infrastructure.</p>
</div>
<div class="paragraph">
<p>One challenge this introduces is tests that “flake” due to resource allocation differences. For instance, if you allocate a particularly intensive unit test to eight cores in a container with 16 gigs of RAM, you can expect a different runtime than allocating a container with two cores and eight gigs of RAM. Throw into the mix that all of us are doing development on different laptops, with different core counts, with different <em>architectures</em>, and you have a recipe for some pretty challenging non-deterministic test runtimes.</p>
</div>
<div class="paragraph">
<p><a href="https://cwiki.apache.org/confluence/x/1AorCQ" target="_blank" rel="noopener">Currently we accept both a Green run on CircleCI and a Green run on DevBranch on Jenkins as acceptable for committers to merge code</a>. This introduces a gap for us as the Circle plan can allocate more resources to containers for running tests based on the plan you use, meaning a test could pass on Circle that subsequently fails on ASF Jenkins due to resourcing limitations.</p>
</div>
<div class="paragraph">
<p>Another challenge we face is that it can be challenging to author tests in Cassandra that have deterministic results in the face of scheduling pressures. Given the long legacy of our project (which dates back to 2008), we have quite a bit of static state without existing stub implementations for testing, meaning many of our unit tests spin up state in other areas of the database, write files to disk, and otherwise mutate state in adjacent subsystems. <a href="https://github.com/apache/cassandra/blob/cassandra-4.1/.circleci/generate.sh#L41-L49" target="_blank" rel="noopener">What this translates into is the need to run new tests 100 times on CI infrastructure before committing them, a practice we haven’t yet enshrined into our process</a>.</p>
</div>
</div>
<div class="sect2">
<h3 id="testing-complex-systems-is-itself-complex"><a class="anchor" href="#testing-complex-systems-is-itself-complex"></a>Testing Complex Systems is Itself Complex</h3>
<div class="paragraph">
<p>The Cassandra testing ecosystem consists of a variety of different suites targeting different subsystems and operations in the database. From a high level, a look at the <a href="https://ci-cassandra.apache.org/job/Cassandra-trunk/" target="_blank" rel="noopener">top level testing pipelines of the project</a> shows standouts like testing with <a href="https://issues.apache.org/jira/browse/CASSANDRA-6809" target="_blank" rel="noopener">compression</a>, with <a href="https://issues.apache.org/jira/browse/CASSANDRA-8844" target="_blank" rel="noopener">change-data-capture</a> enabled, during upgrades, both unit vs. distributed, etc. We have a <a href="https://ci-cassandra.apache.org/computer/" target="_blank" rel="noopener">cluster of machines dedicated</a> to testing Cassandra and tending to their needs is a significant task, all of which are <a href="https://github.com/apache/cassandra-builds/blob/trunk/ASF-jenkins-agents.md#current-agents" target="_blank" rel="noopener">donated by different participants</a> within the Apache Cassandra ecosystem.</p>
</div>
<div class="paragraph">
<p>Taking a quick look at the <a href="https://ci-cassandra.apache.org/job/Cassandra-trunk/1112/flowGraphTable/" target="_blank" rel="noopener">runtime pipeline under the hood</a>, you can see the large distributed effort that it is to break down the different jobs across these different agents. <a href="https://github.com/apache/cassandra-builds/blob/trunk/jenkins-dsl/cassandra_job_dsl_seed.groovy" target="_blank" rel="noopener">The code required to generate, distribute, build, collect logs from, teardown, and maintain</a> all these jobs on these machines lives in the <a href="https://github.com/apache/cassandra-builds" target="_blank" rel="noopener">cassandra-builds repo</a> inside Apache on GitHub.</p>
</div>
<div class="paragraph">
<p>Throwing all this hardware and parallelization at our almost 50,000 tests takes our total test runtime <strong>down to 4h 9m 4s</strong>. A big shout-out to Mick Semb Wever, committer and PMC member on the project, who’s done a ton of work to get us this far with our CI infrastructure!</p>
</div>
<div class="paragraph">
<p>We have a few ideas for ways to reduce the total processing burden of our tests; with this much compute required and this many tests, small percentages add up to big gains. Jacek Lewandowski is targeting some file operations and general speedup in <a href="https://issues.apache.org/jira/browse/CASSANDRA-17427" target="_blank" rel="noopener">CASSANDRA-17427</a>, Berenguer Blasi is looking into potentially re-using dtest clusters in our python dtests to cut out unnecessary cluster startup and shutdown times in <a href="https://issues.apache.org/jira/browse/CASSANDRA-16951" target="_blank" rel="noopener">CASSANDRA-16951</a>, and after a little analysis I’ve uncovered that roughly 20% of our unit test runtime is comprised of 2.62% of our tests, giving us some low hanging fruit to potentially target to speed things up in <a href="https://issues.apache.org/jira/browse/CASSANDRA-17371" target="_blank" rel="noopener">CASSANDRA-17371</a>.</p>
</div>
<div class="paragraph">
<p>Lastly, we have a Jenkins to JIRA integration script drafted in <a href="https://issues.apache.org/jira/browse/CASSANDRA-17277?focusedCommentId=17493385&amp;page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17493385" target="_blank" rel="noopener">CASSANDRA-17277</a> that would automatically update tickets with the results of their CI runs on ASF Jenkins infrastructure. This is necessary as we have two paths for code to get certified for inclusion (circle or ASF Jenkins) with the former being more heavily resourced than the latter, but the latter being our gatekeeper.</p>
</div>
</div>
<div class="sect2">
<h3 id="the-future-of-testing-in-cassandra"><a class="anchor" href="#the-future-of-testing-in-cassandra"></a>The Future of Testing in Cassandra</h3>
<div class="paragraph">
<p>As we head into the verification cycle for Cassandra 4.1 we’re going to be using <a href="https://cwiki.apache.org/confluence/x/tQzjBw" target="_blank" rel="noopener">the same Release Lifecycle definitions</a> we ratified back in 2019. Of note, we won’t transition from alpha to beta without green tests: <em>“No flaky tests - All tests (Unit Tests and DTests) should pass consistently. A failing test, upon analyzing the root cause of failure, may be “ignored in exceptional cases”, if appropriate, for the release, after discussion in the dev mailing list.”</em></p>
</div>
<div class="paragraph">
<p>So we’re going to drive back to a green test board as we do for each major release, but are we going to make an effort to stay there and if so, how?</p>
</div>
<div class="paragraph">
<p>I’ve been working on this project since early 2014 (!), and this has always been a challenge for us. That said, after analyzing the numbers for this blog post and realizing <em>just how much</em> we’re proportionally expanding our <em>testing</em>, I’m heartened by the progress we’re making; the proportion of flaky or failing tests is objectively falling over time. A total of 15 failing tests out of 50,000 is a lot less than 15 failing out of 25,000, or 12,500 for example, so we’re definitely moving in the right direction.</p>
</div>
<div class="paragraph">
<p>If we take the value of having a green test board as self-evident (developer time, triaging, branch stability, feedback loops, etc), how can we stay there after the 4.1 release? A bot letting us know ASAP if our patch correlates with a new test failure should help, as will lowering the total runtime required between running our tests and merging them. Lastly, in January of 2022 we introduced a new <a href="https://cwiki.apache.org/confluence/x/DI3kCw" target="_blank" rel="noopener">Build Lead</a> role to shepherd integration with our CI tracking system <a href="https://butler.cassandra.apache.org/#/" target="_blank" rel="noopener">Butler</a> which has had a very positive impact on our visibility of and momentum on fixing test failures.</p>
</div>
<div class="paragraph">
<p>We have a balanced tension between wanting to get code changes into the system rapidly for contributors fortunate enough to be able to use CircleCI while also providing for and encouraging usage of the freely available Apache Jenkins infrastructure, but we’re bridging the gap this naturally creates.</p>
</div>
<div class="paragraph">
<p>Contributors around the globe are working hard to get Cassandra 4.1 GA soon and just like Cassandra 4.0 before it, we expect this to be the most stable, best performing version of Apache Cassandra we’ve ever released. You can download the test build of Cassandra 4.1 <a href="https://nightlies.apache.org/cassandra/cassandra-4.1/Cassandra-4.1-artifacts/23/Cassandra-4.1-artifacts/" target="_blank" rel="noopener">here</a> and test it out - let us know what you think!</p>
</div>
<div class="paragraph">
<p>If you haven’t yet, come join the <a href="../community.html" class="page">Cassandra development community</a> and get involved in making the most scalable and available database in the world!</p>
</div>
</div>
</div>
</div>
</div>
<footer class="grad grad--two flex-center pb-xlarge">
<div class="inner text-center z2 relative">
<h2 class="white py-small">Get started with Cassandra, fast.</h2>
<a id="footer-cta" href="/_/quickstart.html" class="btn btn--filled ma-medium">Quickstart Guide</a>
</div>
<div class="inner flex flex-distribute-items mt-xlarge z2 relative">
<div class="col-2">
<div id="footer-logo" class="logo logo--footer mb-medium"><img src="../../assets/img/logo-white-r.png" alt="Cassandra Logo"></div>
<p>Apache Cassandra<img src="../../assets/img/registered.svg" alt="®" style="width:18px;"> powers mission-critical deployments with improved performance and unparalleled levels of scale in the cloud.</p>
<div class="footer-social-icons">
<a href="https://twitter.com/cassandra?lang=en" target="_blank"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
</div>
<div class="col-2 flex flex-center">
<ul class="columns-2">
<li class="mb-small"><a href="/">Home</a></li>
<li class="mb-small"><a href="/_/cassandra-basics.html">Cassandra Basics</a></li>
<li class="mb-small"><a href="/_/quickstart.html">Quickstart</a></li>
<li class="mb-small"><a href="/_/ecosystem.html">Ecosystem</a></li>
<li class="mb-small"><a href="/doc/latest/">Documentation</a></li>
<li class="mb-small"><a href="/_/community.html">Community</a></li>
<li class="mb-small"><a href="/_/case-studies.html">Case Studies</a></li>
<li class="mb-small"><a href="/_/resources.html">Resources</a></li>
<li class="mb-small"><a href="/_/blog.html">Blog</a></li>
</ul>
</div>
</div>
</footer>
<div class="lower-footer bg-white pa-medium">
<div class="flex flex-row flex-vert-center">
<div class="pr-medium"><img src="../../assets/img/feather-small.png" alt="ASF" width="20"></div>
<div class="pr-medium"><a href="http://www.apache.org/" target="_blank">Foundation</a></div>
<div class="pr-medium"><a href="https://www.apache.org/events/current-event.html" target="_blank">Events</a></div>
<div class="pr-medium"><a href="https://www.apache.org/licenses/" target="_blank">License</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/thanks" target="_blank">Thanks</a></div>
<div class="pr-medium"><a href="https://www.apache.org/security" target="_blank">Security</a></div>
<div class="pr-medium"><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank">Privacy</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/sponsorship" target="_blank">Sponsorship</a></div>
</div>
<p class="my-medium">© 2009-<script>document.write(new Date().getFullYear())</script> <a href="https://apache.org" target="_blank">The Apache Software Foundation</a> under the terms of the Apache License 2.0. Apache, the Apache feather logo, Apache Cassandra, Cassandra, and the Cassandra logo, are either registered trademarks or trademarks of The Apache Software Foundation.</p>
</div>
<div id="fade" class="hidden"></div>
<div id="modal" class="hidden">
<div id="close-modal" class="cursor-pointer"><svg viewBox="0 0 24 24" width="24" height="24" stroke="currentColor" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round" class="css-i6dzq1"><line x1="18" y1="6" x2="6" y2="18"></line><line x1="6" y1="6" x2="18" y2="18"></line></svg></div>
<div id="mod-content" class="vid-mod-content resp-container"></div>
</div>
<script>
// Page UI wiring, run once the DOM is ready. All click handlers are delegated
// from `document`, so they also apply to matching elements added later.
jQuery(function () {
  // Viewport width (px) at or below which a click on the main nav hides it.
  var NAV_DISMISS_MAX_WIDTH = 1000;

  $(document)
    // Nav toggle icon: fade the main navigation in.
    .on('click', '.mobile-nav-icon', function () {
      $('.main-nav').fadeIn();
    })
    // Click inside the main navigation: fade it out on narrow viewports only.
    .on('click', '.main-nav', function () {
      // Measure the window at click time. A width cached at DOM-ready goes
      // stale after a resize or device rotation and makes this check wrong.
      if ($(window).width() <= NAV_DISMISS_MAX_WIDTH) {
        $(this).fadeOut();
      }
    })
    // Toggle the element that immediately follows #version-toggle.
    // NOTE(review): #version-toggle does not appear in this page's markup;
    // presumably it exists on other pages sharing this script.
    .on('click', '#version-toggle', function () {
      $(this).toggleClass('active');
      $(this).next().fadeToggle();
    })
    // Toggle the `active` class on the burger and on .docs-nav.
    // NOTE(review): neither element appears in this page's markup.
    .on('click', '#mobile-docs-nav-burger', function () {
      $(this).toggleClass('active');
      $('.docs-nav').toggleClass('active');
    });

  // On the quickstart page the footer call-to-action would link to the page
  // itself, so point it at the blog instead.
  var url = window.location.pathname;
  var isQuickstart = url.includes('quickstart.html');
  if (isQuickstart) {
    var footerCTA = document.getElementById('footer-cta');
    // Guard against a missing #footer-cta so the handler does not throw.
    if (footerCTA) {
      footerCTA.innerHTML = 'Get latest updates';
      footerCTA.setAttribute('href', '/_/blog.html');
    }
  }
});
</script>
</div>
</body>
<script>
// Registers a jQuery DOM-ready callback with an empty body; it performs no work.
// NOTE(review): the enclosing <script> sits after </body>, which is invalid
// placement; consider removing this block or moving it inside <body>.
jQuery(function(){
});
</script>
</html>