blob: 100d57b6d058a54e3ecf2715fd34faea4e727ca2 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1.0">
<title>Apache Cassandra | Apache Cassandra Documentation</title>
<link rel="stylesheet" href="../../assets/css/site.css">
<meta name="description" content="The Apache Cassandra Community">
<link rel="schema.dcterms" href="https://purl.org/dc/terms/">
<meta name="dcterms.subject" content="_">
<meta name="dcterms.identifier" content="master">
<meta name="generator" content="Antora 2.3.4">
<link rel="icon" href="../../assets/img/favicon.ico" type="image/x-icon">
<script>
const script = document.createElement("script");
const domain = window.location.hostname;
script.type = "text/javascript";
script.src = "https://plausible.cassandra.apache.org/js/plausible.js";
script.setAttribute("data-domain",domain);
script.setAttribute("defer",'true');
script.setAttribute("async",'true');
document.getElementsByTagName("head")[0].appendChild(script);
</script> </head>
<body class="single-post">
<div class="container mx-auto relative">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<meta property="og:type" content="website" />
<meta property="og:description" content="" />
<meta property="og:url" content="/" />
<meta property="og:site_name" content="Apache Cassandra" />
<header id="top-nav">
<div class="inner relative">
<div class="header-social-icons text-right">
<a href="https://twitter.com/cassandra?lang=en" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank" styles="margin-left: 20px;"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
<div class="cf">
<div class="logo left"><a href="/"><img src="../../assets/img/logo-white-r.png" alt="cassandra logo"></a></div>
<div class="mobile-nav-icon right">
<img class="toggle-icon" src="../../assets/img/hamburger-nav.svg">
</div>
<ul class="main-nav nav-links right flex flex-vert-center flex-space-between">
<li>
<a class="nav-link hide-mobile">Get Started</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/cassandra-basics.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra Basics
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/quickstart.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-rocket.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Quickstart
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/ecosystem.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-ecosystem.png" alt="cassandra basics icon">
</div>
<div class="sub-nav-text teal py-small">
Ecosystem
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link" href="/doc/latest/">Documentation</a></li>
<li>
<a class="nav-link" href="/_/community.html">Community</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/community.html#code-of-conduct">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-welcome.png" alt="welcome icon">
</div>
<div class="sub-nav-text teal py-small">
Welcome
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#discussions">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-discussions.png" alt="discussions icon">
</div>
<div class="sub-nav-text teal py-small">
Discussions
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#project-governance">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-governance.png" alt="Governance icon">
</div>
<div class="sub-nav-text teal py-small">
Governance
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#how-to-contribute">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-contribute.png" alt="Contribute icon">
</div>
<div class="sub-nav-text teal py-small">
Contribute
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/community.html#meet-the-community">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-community.png" alt="Meet the Community icon">
</div>
<div class="sub-nav-text teal py-small">
Meet the Community
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/cassandra-catalyst-program.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-catalyst.png" alt="Catalyst icon">
</div>
<div class="sub-nav-text teal py-small">
Catalyst Program
</div>
</a>
</li>
<li class="pa-micro hide-mobile">
<a href="/_/events.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-events.png" alt="Events icon">
</div>
<div class="sub-nav-text teal py-small">
Events
</div>
</a>
</li>
</ul>
</li>
<li>
<a class="nav-link hide-mobile">Learn</a>
<ul class="sub-menu bg-white">
<li class="pa-micro">
<a href="/_/Apache-Cassandra-5.0-Moving-Toward-an-AI-Driven-Future.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-basics.png" alt="Basics icon">
</div>
<div class="sub-nav-text teal py-small">
Cassandra 5.0
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/case-studies.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-case-study.png" alt="Case Studies icon">
</div>
<div class="sub-nav-text teal py-small">
Case Studies
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/resources.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-resources.png" alt="Resources icon">
</div>
<div class="sub-nav-text teal py-small">
Resources
</div>
</a>
</li>
<li class="pa-micro">
<a href="/_/blog.html">
<div class="sub-nav-icon">
<img src="../../assets/img/sub-menu-blog.png" alt="Blog icon">
</div>
<div class="sub-nav-text teal py-small">
Blog
</div>
</a>
</li>
</ul>
</li>
<li><a class="nav-link btn btn--filled" href="/_/download.html">Download Now</a></li>
</ul>
</div>
</div>
</header>
<div class="hero hero--home grad">
<div class="eye"></div>
<div id="home-content" class="text-center flex flex-center flex-column relative z2 ma-xlarge">
<h1>Harry, an Open Source Fuzz Testing and Verification Tool for Apache Cassandra</h1>
<h3>November 18, 2021 | Alex Petrov</h3>
</div>
</div>
<div id="blog-post" class="flex-center py-large arrow">
<div class="blog-breadcrumb mb-medium">
<div class="inner inner--narrow">
<a href="/_/blog.html">« Back to the Apache Cassandra Blog</a>
</div>
</div>
<div class="post-content">
<div class="inner inner--narrow">
<div id="preamble">
<div class="sectionbody">
<div class="paragraph">
<p>Over the years working on Apache Cassandra while writing tests or
trying to reproduce the issues, I’ve always found myself repeating the
same procedure over and over again: creating schema, writing loops
generating data, then either manually reconciling it to check the
results, or comparing the result set against some predetermined
expected result. Not only is this approach tedious and time-consuming,
but it also does not scale: if some set of operations work for one
schema, there’s no way to know if it will also work for any arbitrary
schema, whether it will work if operations are executed in a different
order, or if operations themselves are slightly different.</p>
</div>
<div class="paragraph">
<p>While preparing Apache Cassandra for 4.0 release, we’ve made extensive
progress in how we test. The new in-tree in-JVM distributed test
framework enables us to easily write tests that exercise coordinated
query execution code paths while giving us flexibility and control
that was previously offered only by CQLTester, a tool for exercising
node-local query paths. Many subsystems were audited and covered with
tests. Cassandra users tried the new version out in their clusters and
reported their findings. All of these things are useful and important,
but we still needed a tool that would give us the same or higher
degree of confidence for every commit so that we could know that the
database is working as expected, not only for an exact set of
operations that exercised by unit and integration tests, but
potentially for any use-case and combination of operations under
circumstances comparable to production.</p>
</div>
<div class="paragraph">
<p>This all led us to develop Harry, a tool that can combine properties
of stress- and integration-testing tools. Harry is a tool that can
generate data for an arbitrary schema, execute data modification
queries against the cluster, track the progress of operation
execution, and make sure that responses to read queries are correct.</p>
</div>
<div class="paragraph">
<p>After reading this post, you will understand:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>how Harry generates the data</p>
</li>
<li>
<p>how Harry performs verification</p>
</li>
<li>
<p>which properties values generated by Harry make verification not only possible but also efficient.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>The primary audience for this post is Cassandra contributors, so you
will need to be familiar with Apache Cassandra and its tooling.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="fuzz-testing"><a class="anchor" href="#fuzz-testing"></a>Fuzz testing</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Since most of the bugs are reproduced by taking a sequence of actions
following some pattern, we need to specify what actions can be used to
lead to a given state. However, there’s a lot of flexibility regarding
which values exactly are written in specific columns.</p>
</div>
<div class="paragraph">
<p>For example, if we look at
<a href="https://issues.apache.org/jira/browse/CASSANDRA-16453" target="_blank" rel="noopener">CASSANDRA-16453</a>,
which was reproduced using Harry. Code to reproduce the issue with
in-JVM DTests looks something like this:</p>
</div>
<div class="listingblock">
<div class="title">Repro.java</div>
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">try (Cluster cluster = init(builder().withNodes(2).start()))
{
cluster.schemaChange(withKeyspace("CREATE TABLE distributed_test_keyspace.table_0 (pk0 bigint,ck0 bigint,regular0 bigint,regular1 bigint,regular2 bigint, PRIMARY KEY (pk0, ck0)) WITH CLUSTERING ORDER BY (ck0 ASC);"));
cluster.coordinator(1).execute("DELETE FROM distributed_test_keyspace.table_0 USING TIMESTAMP 1 WHERE pk0=1 AND ck0&gt;2;", ConsistencyLevel.ALL);
cluster.get(2).executeInternal("DELETE FROM distributed_test_keyspace.table_0 USING TIMESTAMP 1 WHERE pk0=1;");
cluster.coordinator(1).execute("SELECT * FROM distributed_test_keyspace.table_0 WHERE pk0=1 AND ck0&gt;=1 AND ck0&lt;3;",
ConsistencyLevel.ALL, 1L, 1L, 3L);
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>You can see that, at first glance, there are only three things that
that are important to reproduce the issue:</p>
</div>
<div class="olist arabic">
<ol class="arabic">
<li>
<p>The table has to have at least one clustering column</p>
</li>
<li>
<p>Two actions are executed against the cluster: a range deletion, and a partition deletion</p>
</li>
<li>
<p>Both operations have the same timestamp</p>
</li>
</ol>
</div>
<div class="paragraph">
<p>The rest of the details do not matter: size of the cluster, number of
replicas, clustering order, consistency level with which operations
are executed, types of clustering keys and values written, and so on.</p>
</div>
<div class="paragraph">
<p>The simplest way to cover a case like this with a test is to hardcode
the schema and then execute a partition deletion and a range deletion
hardcoding the values, much as we did above. This might work, but
there’s still a chance that the proposed fix may not work for some
other schema or some combination of values.</p>
</div>
<div class="paragraph">
<p>To improve the situation, we can express the test in more abstract
terms and, instead of writing a repro using specific statements, we
can only use the constraints we’ve specified above:</p>
</div>
<div class="listingblock">
<div class="title">HarryDsl.java</div>
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">test(new SchemaGenerators.Builder("harry")
.partitionKeySpec(1, 5)
.clusteringKeySpec(1, 5)
.regularColumnSpec(1, 10)
.generator(),
historyBuilder -&gt; {
historyBuilder.nextPartition()
.simultaneously()
.randomOrder()
.partitionDeletion()
.rangeDeletion()
.finish();
});</code></pre>
</div>
</div>
<div class="paragraph">
<p>This spec can be used to generate clusters of different sizes,
configured with different schemas, executing the given sequence of
actions both in isolation and combined with other randomly generated
ones, with failure-injection. Best of all, this test will <em>not only</em>
ensure that such a sequence of actions does not produce an exception
but also ensures that a cluster will respond with correct results to
<em>any</em> allowed read query.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="generating-data"><a class="anchor" href="#generating-data"></a>Generating data</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Generating random values and sequences of actions and reconciling them
during verification is in itself not a difficult task. Making this
process time- and memory-efficient is what makes it more interesting.</p>
</div>
<div class="paragraph">
<p>For space efficiency, the log of actions generated using Harry is not
kept in memory or saved anywhere on disk since any generated operation
can be reproduced from its sequence number. In Harry, a sequence
number consists of two parts: the logical timestamp (LTS, which has
1-1 mapping to real-time timestamp), and the modification ID, which
allows having multiple uniquely identifiable operations for each
logical timestamp. For the sake of simplicity, we’ll just say that
each operation is represented by its sequence number / LTS.</p>
</div>
<div class="paragraph">
<p>In the example above, the operation order is determined by the seed
for the given run. Let’s say that partition deletion is executed
first. To produce a <code>DELETE</code> statement from it, we now need to
generate a partition key and get a timestamp. Similarly, to generate a
range deletion, we will need a partition key, two clustering keys that
will serve as lower and higher bounds for the range tombstone, and a
timestamp.</p>
</div>
<div class="paragraph">
<p>Using the sequence number and knowing the operation type, we can now
produce <em>descriptors</em> that are used as the compact internal
representation of data in Harry. No matter how many parts it consists
of, any partition key is represented by a single <code>long</code>. The same is
true for the clustering keys: any clustering key, single-part or
composite, is represented using a single <code>long</code> descriptor. If we were
to generate an <code>INSERT</code> or <code>UPDATE</code> operation, each value for a
regular or a static column would have its own descriptor since we
would want to distinguish between two writes made by two different
operations.</p>
</div>
<div class="paragraph">
<p>To summarise, every operation has a sequence number, which determines
everything that is required to fully reproduce this operation,
including descriptors that we will later use to generate values
themselves:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>partition deletion only has a partition descriptor</p>
</li>
<li>
<p>range deletion has a partition descriptor and two clustering descriptors, specifying tombstone bounds</p>
</li>
<li>
<p>insert or update operation has a partition descriptor, a clustering descriptor, and a set of value descriptors, one for each regular and static column.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>Using descriptors rather than specific values for verification can be
extremely useful for efficiency. Instead of comparing potentially
large values, we could just compare two longs that uniquely identify
them. This means that we have to have a way to not <em>only</em> generate a
value from the descriptor, but <em>also</em> to compute a descriptor the
value was generated from.</p>
</div>
<div class="paragraph">
<p>In Harry, we call such a generator <code>Bijection&lt;T&gt;</code>, and every bijection
can <em>inflate</em> a descriptor into the value of type <code>T</code>. Then <em>deflate</em>
the value of type <code>T</code> back into the descriptor where it was originally
generated.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="validating-results"><a class="anchor" href="#validating-results"></a>Validating results</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Applying a predetermined sequence of operations against a single
partition produces some partition state. Knowing the status of
execution of each operation, we can deterministically determine the
state of each node in the cluster and validate the results of
execution of any <code>SELECT</code> query.</p>
</div>
<div class="paragraph">
<p>Since we can represent any operation as a sequence of descriptors, we
know the order of operations (since the timestamp determines it). We
can assume we know the status of each operation (whether or not it has
been executed against some node), and we can deterministically produce
partition state for any given point in time. Partition state is
nothing but a sorted map, where the key is a clustering descriptor,
and value is a row state. Row state, in this case, holds value
descriptors for each column, and timestamps where operations were
executed:</p>
</div>
<div class="listingblock">
<div class="title">PartitionState.java</div>
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">public class PartitionState implements Iterable&lt;RowState&gt; {
long partitionDescriptor;
NavigableMap&lt;Long, RowState&gt; rowStates;
}
public static class RowState {
long[] valueDescriptors;
long[] logicalTimestamps;
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>Similarly, since any value written to the database is generated using
a bijection, we can produce the partition state from the result set by
deflating every value returned by the database into the descriptor
that it was generated from.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="generating-descriptors"><a class="anchor" href="#generating-descriptors"></a>Generating Descriptors</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Reproducible operation sequences can be generated from a set of rules
that determines what the sequence is going to look like. For example,
we can specify probability distributions for each operation type or
give operations relative weights, which can be turned into the
distribution internally later. Configuration for an insert / update /
delete workload with a probability of an insert operation (100/251)
being twice as high as a probability of a row deletion (50/251), and
ten times more probable than a partition deletion (1/251), would look
like:</p>
</div>
<div class="listingblock">
<div class="content">
<pre>INSERT: 100
UPDATE: 100
DELETE_ROW: 50
DELETE_PARTITION: 1</pre>
</div>
</div>
<div class="paragraph">
<p>Since each operation is uniquely determined by its sequence number, we
can deterministically compute its operation type by taking these
probability distributions. One way to do this is by using PCG random
number generator, which has some useful properties we’re going to use
for generating our pseudorandom values.</p>
</div>
<div class="paragraph">
<p>If you’d like to learn more about the mathematical underpinnings of
PCG, you should read this paper
(<a href="https://www.pcg-random.org/pdf/hmc-cs-2014-0905.pdf" target="_blank" rel="noopener">https://www.pcg-random.org/pdf/hmc-cs-2014-0905.pdf</a>). However,
to be able to use PCG, it is not necessary to know any of the
internals. We need a random number generator that will have the
following properties:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>Long period: sequence of numbers it produces does not repeat
frequently; ideally the period should be 2^64 when generating a
random number from 64 bits of entropy</p>
</li>
<li>
<p>Stream selection: the ability to produce different random
sequences from the same seed, identified by some stream id.</p>
</li>
<li>
<p>Addressability: any number produced by the generator can be
reproduced from the seed and its sequence number. Ideally, we’d
like to have methods such as <code>long randomNumber(long
sequenceNumber, long stream)</code> and <code>long sequenceNumber(long
randomNumber, long stream)</code>. In other words, we should be able to
determine the sequence number of the random number in the given
stream. Using this method, we can also determine <code>distance(long x,
long y)</code> : how many random numbers we should skip to get <code>y</code> after
seeing <code>x</code>.</p>
</li>
<li>
<p>Walkability: the ability to produce a number immediately following
<code>long next(long randomNumber, long stream)</code> or preceding <code>long
prev(long randomNumber, long stream)</code> the given random number in
the random sequence.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>You might have noticed that there are two ways to achieve the same
thing. We can get a pseudorandom number from some number known by the
system by using <code>randomNumber(i, stream)</code> and by using <code>prev(i,
stream)</code>. Both variants are valid, and both operations can be
inverted. We have a slight preference toward using <code>prev</code>, since its
inverse can be computed in constant time.</p>
</div>
<div class="paragraph">
<p>These properties allow us to reproduce partition state from just
configuration (i.e., known distributions, schema, size of the
partition, etc) and a seed:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>Partition descriptor for <code>N</code> th operation can be picked as <code>M</code> th
random number in the stream of partition descriptors, and the
relation between <code>N</code> and <code>M</code> is determined by the chosen pattern
for visiting partitions.</p>
</li>
<li>
<p>Clustering descriptor for <code>N</code> th operation can be picked as <code>M</code> th
random number in the stream of clustering descriptors <strong>for the
given partition</strong>, where maximum <code>M</code> is determined by the maximum
partition size, so there can be no more than <code>max(M)</code> rows in any
generated partition.</p>
</li>
</ul>
</div>
<div class="paragraph">
<p>One of the simplest useful ways to represent a pattern for picking a
descriptor from the sequence is to use a sliding window. The sliding
window begins with a preset number of items in it and allows to visit
each item in the current window one or several times in a round-robin
fashion. After this, it cycles one of the items out and adds a new one
in its place.</p>
</div>
<div class="paragraph">
<p>Once operation type, partition descriptor, and clustering descriptors
are determined, all we have left to cover is how to generate value
descriptors for <code>INSERT</code> and <code>UPDATE</code> operations. Value descriptor for
a column is uniquely identified by its sequence number and is bound by
partition descriptor, clustering descriptor, and column.</p>
</div>
<div class="paragraph">
<p>To summarise, all operations in Harry are deterministic and are
represented using their descriptors. Descriptors can be computed
hierarchically using the following rules:</p>
</div>
<div class="ulist">
<ul>
<li>
<p>Partition descriptor is picked from the <em>stream</em> of partition descriptors. Its position in that stream is determined by some rule (for example, a sliding window):</p>
</li>
</ul>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">long pd = rng.randomNumber(positionFor(sequenceNumber), PARTITION_DESCRIPTOR_STREAM_ID)</code></pre>
</div>
</div>
<div class="ulist">
<ul>
<li>
<p>Clustering descriptor is picked from the <em>stream</em> of clustering descriptors <strong>for the given partition</strong>.</p>
</li>
</ul>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">long cd = rng.prev(positionInPartition, pd);</code></pre>
</div>
</div>
<div class="ulist">
<ul>
<li>
<p>Value descriptor is picked from the <em>stream</em> of descriptors identified by which partition, clustering, and column the value belongs to:</p>
</li>
</ul>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">long vd = rng.randomNumber(sequenceNumber, pd ^ cd ^ col);</code></pre>
</div>
</div>
</div>
</div>
<div class="sect1">
<h2 id="inflation-and-deflation"><a class="anchor" href="#inflation-and-deflation"></a>Inflation and Deflation</h2>
<div class="sectionbody">
<div class="paragraph">
<p>We’ve mentioned before that one reason Harry state is so compact and
can be validated so efficiently is because every value read from the
database can be traced back to the descriptor it was generated
from. To achieve this, we generate all values using order-preserving
bijections. In other words, for any value generated from a descriptor,
it should be possible to quickly find a descriptor this value was
generated from, and two values generated from two distinct descriptors
should sort the same as descriptors themselves.</p>
</div>
<div class="paragraph">
<p>Implementing an order-preserving bijection for 64-bit longs is trivial
and can be achieved by using an identity function. Essentially, any
long descriptor <em>is</em> the value it represents:</p>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">long inflate(long descriptor) {
return descriptor;
}
long deflate(long value) {
return value;
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>There are many ways to make a bijection for strings. One of the ways
to do it is to have a set of 256 short strings of the same length in a
sorted array. When inflating a 64-bit long descriptor into the string,
we’ll be iterating over these 64 bits, taking 8 bits (one byte) at a
time, using the value of this byte as an index in an array of 256
strings.</p>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">String inflate(long descriptor) {
StringBuilder builder = new StringBuilder();
for (int i = 0; i &lt; Long.BYTES; i++) {
int idx = getByte(descriptor, i);
builder.append(nibbles[idx]);
}
return builder.toString();
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>One thing we should take into account here is that strings are
compared byte-wise, while longs use signed comparison. To make sure
generated strings have the same order as descriptors, we need to XOR
the sign bit.</p>
</div>
<div class="paragraph">
<p>Since any two strings produced by this generator will be unique, and
we can produce at most 2^64 values using this generator, to generate
longer strings we do not even need larger nibbles. We can append
random data of arbitrary length to the end of the string. This does
not change the order since it is determined by the prefix generated
from nibbles that is unique to each value.</p>
</div>
<div class="paragraph">
<p>Such simple bijections can represent data types used for regular and
static columns. We’ve previously mentioned that partition and
clustering keys are also represented using 64-bit
descriptors. Partition and clustering keys are composite: they consist
of multiple distinct parts. One way to implement bijection for a
composite type is to “slice” 64 bits of entropy into smaller chunks,
each chunk giving some entropy to generate a different part of the
key. Each slice is then inflated using a bijection that corresponds to
the part of the key it represents. To convert the value back to the
descriptor, we must deflate each part of the key and then “stitch” the
values back together into a 64-bit descriptor.</p>
</div>
<div class="paragraph">
<p>To summarise, key generators are just bijections that can generate
multiple values for a single 64-bit descriptor instead of one. A
simplified and generalized version of such bijection may look
something like this:</p>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">Object[] inflate(long descriptor) {
long[] slices = slice(descriptor);
Object[] key = new Object[slices.length];
for (int i = 0; i &lt; slices.length; i++) {
key[i] = children[i].inflate(slices[i]);
}
return key;
}
long deflate(Object[] value) {
long[] slices = new long[value.length];
for (int i = 0; i &lt; value.length; i++) {
slices[i] = children[i].deflate(value[i]);
}
return stitch(slices);
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>Values generated by key generators preserve the order of descriptors
they were generated from, which allows efficiently checking the order
of results, comparing clustering descriptors, and validating range
deletions.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="putting-it-all-together"><a class="anchor" href="#putting-it-all-together"></a>Putting it all together</h2>
<div class="sectionbody">
<div class="paragraph">
<p>In this post, we’ve learned how the various parts of Harry work,
starting with how to reproduce a sequence of operations up to how the
values are generated. Using this information, we can create a
quiescent model checker that can validate the state of the database in
the absence of in-flight operations, assuming we know the state of all
operations before this moment.</p>
</div>
<div class="paragraph">
<p>As we’ve discussed, Harry is working with reproducible histories of
operations, where the following information identifies each operation:</p>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">class Operation {
long lts; // logical timestamp of the operation
long pd; // partition descriptor, derived from LTS
long cd; // clustering descriptor, derived from LTS and PD
long OperationKind; // operation type, derived from LTS and PD
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>Now, all we need to do is to produce a sequence of operations. For example, each operation with <code>INSERT</code> kind is going to be represented by:</p>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlightjs highlight"><code class="language-java hljs" data-lang="java">class Insert {
long lts; // logical timestamp
long pd; // partition descriptor
long cd; // clustering descriptor
long[] vds; // value descriptors for each column
}</code></pre>
</div>
</div>
<div class="paragraph">
<p>We can compile this information into an INSERT statement by inflating
partition, clustering, and value descriptors, and deriving the
real-time timestamp from the logical timestamp. This statement can
then be executed against the system under test (our Cassandra
cluster).</p>
</div>
<div class="paragraph">
<p>We know from the history of operations which partitions were visited,
and which operations were executed against them. To validate the state
of any given partition, all we need to do is to query the database to
retrieve partition state from the database, deflate every row returned
in the results: deflate all clustering keys into clustering
descriptors, and values into corresponding value descriptors,
producing internal PartitionState.</p>
</div>
<div class="paragraph">
<p>To verify that this partition state is also correct, we replay the
sequence of operations again. But instead of going all the way to the
generated INSERT statement, we operate with only descriptors and apply
operations sequentially to the PartitionState following usual
Cassandra reconciliation rules (i.e. last-write-wins, partition
tombstone &gt; tombstone &gt; write), compute logical timestamps and value
descriptors for each row.</p>
</div>
<div class="paragraph">
<p>Now, having two partition states: one “deflated” from the result set
returned by the system under test and one “inflated” from the logical
timestamp, we can directly compare them. If there are any
inconsistencies between the two sets, we can conclude there has been
an error.</p>
</div>
<div class="paragraph">
<p>Validating query results by reproducing the sequence of events is more
efficient than working with full operation logs holding all values for
every operation executed against the database state. This process can
be made even more efficient by validating rows individually while
having enough state in memory to validate a single row.</p>
</div>
<div class="paragraph">
<p>Since implementing other Cassandra features, such as partition
deletions, writes without primary key liveness, static columns, column
deletions, etc., do not require any additional information, and
follows the same rules, for the sake of brevity, it is not covered in
this post.</p>
</div>
<div class="paragraph">
<p>Quiescent checker is a very useful tool and can validate data sets
generated by sequential or concurrent runners, as long as the state of
each operation is known at the moment of validation. Since we simply
replay the sequence of events to reproduce the state, we can not have
any events whose state is unknown to the system.</p>
</div>
</div>
</div>
<div class="sect1">
<h2 id="closing-words"><a class="anchor" href="#closing-words"></a>Closing words</h2>
<div class="sectionbody">
<div class="paragraph">
<p>Harry is a tool that allows the testing of databases in ways we
weren’t able to before. Instead of creating test cases expressed in
specific schemas and statements, it enables us to describe sequences
of events we’d like tested and generate schemas and possible
interleavings of statements corresponding to given
specifications. Creating exhaustive test suites can take a lot of
time, and we can have contributor creativity poured into patches and
features, not into test examples. This approach also allows testing
different features in combination and checking old behaviors in the
presence of new ones without explicitly creating new tests.</p>
</div>
<div class="paragraph">
<p>That said, integration testing is not always enough, and often we do
not know which areas of the codebase can be problematic. Harry can be
helpful here, too, since it will generate data, execute the workload
against the system and validate the results. Contributors can focus on
implementing test scenarios such as behavior in presence of unexpected
failures, cluster expansions, or node replacements.</p>
</div>
<div class="paragraph">
<p>Harry is a productivity tool that encapsulates the properties of a
stress test and a model checker, allowing us to find issues and
improve the quality of Cassandra in ways that were not possible
before.</p>
</div>
</div>
</div>
</div>
</div>
</div>
<footer class="grad grad--two flex-center pb-xlarge">
<div class="inner text-center z2 relative">
<h2 class="white py-small">Get started with Cassandra, fast.</h2>
<a id="footer-cta" href="/_/quickstart.html" class="btn btn--filled ma-medium">Quickstart Guide</a>
</div>
<div class="inner flex flex-distribute-items mt-xlarge z2 relative">
<div class="col-2">
<div id="footer-logo" class="logo logo--footer mb-medium"><img src="../../assets/img/logo-white-r.png" alt="Cassandra Logo"></div>
<p>Apache Cassandra<img src="../../assets/img/registered.svg" alt="®" style="width:18px;"> powers mission-critical deployments with improved performance and unparalleled levels of scale in the cloud.</p>
<div class="footer-social-icons">
<a href="https://twitter.com/cassandra?lang=en" target="_blank"><img src="../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a>
<a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank"><img src="../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a>
<a href="https://www.youtube.com/c/PlanetCassandra" target="_blank"><img src="../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a>
</div>
</div>
<div class="col-2 flex flex-center">
<ul class="columns-2">
<li class="mb-small"><a href="/">Home</a></li>
<li class="mb-small"><a href="/_/cassandra-basics.html">Cassandra Basics</a></li>
<li class="mb-small"><a href="/_/quickstart.html">Quickstart</a></li>
<li class="mb-small"><a href="/_/ecosystem.html">Ecosystem</a></li>
<li class="mb-small"><a href="/doc/latest/">Documentation</a></li>
<li class="mb-small"><a href="/_/community.html">Community</a></li>
<li class="mb-small"><a href="/_/case-studies.html">Case Studies</a></li>
<li class="mb-small"><a href="/_/resources.html">Resources</a></li>
<li class="mb-small"><a href="/_/blog.html">Blog</a></li>
</ul>
</div>
</div>
</footer>
<div class="lower-footer bg-white pa-medium">
<div class="flex flex-row flex-vert-center">
<div class="pr-medium"><img src="../../assets/img//feather-small.png" alt="ASF" width="20"></div>
<div class="pr-medium"><a href="http://www.apache.org/" target="_blank">Foundation</a></div>
<div class="pr-medium"><a href="https://www.apache.org/events/current-event.html" target="_blank">Events</a></div>
<div class="pr-medium"><a href="https://www.apache.org/licenses/" target="_blank">License</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/thanks" target="_blank">Thanks</a></div>
<div class="pr-medium"><a href="https://www.apache.org/security" target="_blank">Security</a></div>
<div class="pr-medium"><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank">Privacy</a></div>
<div class="pr-medium"><a href="https://www.apache.org/foundation/sponsorship" target="_blank">Sponsorship</a></div>
</div>
<p class="my-medium">© 2009-<script>document.write(new Date().getFullYear())</script> <a href="https://apache.org" target="_blank">The Apache Software Foundation</a> under the terms of the Apache License 2.0. Apache, the Apache feather logo, Apache Cassandra, Cassandra, and the Cassandra logo, are either registered trademarks or trademarks of The Apache Software Foundation.</p>
</div>
<div id="fade" class="hidden"></div>
<div id="modal" class="hidden">
<div id="close-modal" class="cursor-pointer"><svg viewBox="0 0 24 24" width="24" height="24" stroke="currentColor" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round" class="css-i6dzq1"><line x1="18" y1="6" x2="6" y2="18"></line><line x1="6" y1="6" x2="18" y2="18"></line></svg></div>
<div id="mod-content" class="vid-mod-content resp-container"></div>
</div>
<script>
jQuery(function(){
var windowW = $(window).width();
$(document)
.on('click','.mobile-nav-icon',function(){
$('.main-nav').fadeIn();
})
.on('click','.main-nav',function(){
if(windowW <= 1000){
$(this).fadeOut();
}
})
.on('click','#version-toggle',function(){
$(this).toggleClass('active');
$(this).next().fadeToggle();
})
.on('click','#mobile-docs-nav-burger', function(){
$(this).toggleClass('active');
$('.docs-nav').toggleClass('active');
});
var url = window.location.pathname;
var isQuickstart = url.includes('quickstart.html');
if(isQuickstart){
var footerCTA = document.getElementById('footer-cta');
footerCTA.innerHTML = 'Get latest updates';
footerCTA.setAttribute('href', '/_/blog.html');
}
});
</script>
</div>
</body>
<script>
jQuery(function(){
});
</script>
</html>