| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta name="viewport" content="width=device-width,initial-scale=1.0"> |
| <title>Dynamo | Apache Cassandra Documentation</title> |
| <link rel="stylesheet" href="../../../../assets/css/site.css"> |
| <link rel="schema.dcterms" href="https://purl.org/dc/terms/"> |
| <meta name="dcterms.subject" content="Cassandra"> |
| <meta name="dcterms.identifier" content="4.1"> |
| <meta name="generator" content="Antora 2.3.4"> |
| <link rel="icon" href="../../../../assets/img/favicon.ico" type="image/x-icon"> |
| <script> |
| const script = document.createElement("script"); |
| const domain = window.location.hostname; |
| script.type = "text/javascript"; |
| script.src = "https://plausible.cassandra.apache.org/js/plausible.js"; |
| script.setAttribute("data-domain",domain); |
| script.setAttribute("defer",'true'); |
| script.setAttribute("async",'true'); |
| document.getElementsByTagName("head")[0].appendChild(script); |
| </script> </head> |
| <body class="docs-wrapper article"> |
| <div class="container mx-auto relative"> |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script> |
| <meta property="og:type" content="website" /> |
| <meta property="og:url" content="/" /> |
| <meta property="og:site_name" content="Apache Cassandra" /> |
| |
| <header id="top-nav"> |
| <div class="inner relative"> |
| <div class="header-social-icons text-right"> |
| <a href="https://twitter.com/cassandra?lang=en" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a> |
| <a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a> |
| <a href="https://www.youtube.com/c/PlanetCassandra" target="_blank" rel="noopener" style="margin-left: 20px;"><img src="../../../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a> |
| </div> |
| <div class="cf"> |
| <div class="logo left"><a href="/"><img src="../../../../assets/img/logo-white-r.png" alt="Cassandra Logo"></a></div> |
| <div class="mobile-nav-icon right"> |
| <img class="toggle-icon" src="../../../../assets/img/hamburger-nav.svg" alt="Toggle navigation menu"> |
| </div> |
| <ul class="main-nav nav-links right flex flex-vert-center flex-space-between"> |
| <li> |
| <a class="nav-link hide-mobile">Get Started</a> |
| <ul class="sub-menu bg-white"> |
| <li class="pa-micro"> |
| <a href="/_/cassandra-basics.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-basics.png" alt="cassandra basics icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Cassandra Basics |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro"> |
| <a href="/_/quickstart.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-rocket.png" alt="quickstart icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Quickstart |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro"> |
| <a href="/_/ecosystem.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-ecosystem.png" alt="ecosystem icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Ecosystem |
| </div> |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li><a class="nav-link" href="/doc/latest/">Documentation</a></li> |
| <li> |
| <a class="nav-link" href="/_/community.html">Community</a> |
| <ul class="sub-menu bg-white"> |
| <li class="pa-micro"> |
| <a href="/_/community.html#code-of-conduct"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-welcome.png" alt="welcome icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Welcome |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/community.html#discussions"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-discussions.png" alt="discussions icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Discussions |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/community.html#project-governance"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-governance.png" alt="Governance icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Governance |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/community.html#how-to-contribute"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-contribute.png" alt="Contribute icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Contribute |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/community.html#meet-the-community"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-community.png" alt="Meet the Community icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Meet the Community |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/cassandra-catalyst-program.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-catalyst.png" alt="Catalyst icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Catalyst Program |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro hide-mobile"> |
| <a href="/_/events.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-events.png" alt="Events icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Events |
| </div> |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li> |
| <a class="nav-link hide-mobile">Learn</a> |
| <ul class="sub-menu bg-white"> |
| <li class="pa-micro"> |
| <a href="/_/Apache-Cassandra-5.0-Moving-Toward-an-AI-Driven-Future.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-basics.png" alt="Basics icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Cassandra 5.0 |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro"> |
| <a href="/_/case-studies.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-case-study.png" alt="Case Studies icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Case Studies |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro"> |
| <a href="/_/resources.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-resources.png" alt="Resources icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Resources |
| </div> |
| </a> |
| </li> |
| <li class="pa-micro"> |
| <a href="/_/blog.html"> |
| <div class="sub-nav-icon"> |
| <img src="../../../../assets/img/sub-menu-blog.png" alt="Blog icon"> |
| </div> |
| <div class="sub-nav-text teal py-small"> |
| Blog |
| </div> |
| </a> |
| </li> |
| </ul> |
| </li> |
| <li><a class="nav-link btn btn--filled" href="/_/download.html">Download Now</a></li> |
| </ul> |
| </div> |
| </div> |
| </header> |
| |
| <div class="hero hero--home grad"> |
| <div class="eye"></div> |
| <div id="docs-content" class="text-center flex flex-center flex-column relative z2 ma-xlarge"> |
| <h2>Cassandra Documentation</h2> |
| </div> |
| </div> |
| <div class="body px-medium py-medium container"> |
| <div class="docs-nav-bar flex flex-space-between mb-medium"> |
| <div id="mobile-docs-nav-burger" class="hidden"> |
| <svg viewBox="0 0 24 24" width="36" height="36" stroke="#1c81a0" stroke-width="2.5" fill="none" stroke-linecap="round" stroke-linejoin="round" class="css-i6dzq1"><line x1="3" y1="12" x2="21" y2="12"></line><line x1="3" y1="6" x2="21" y2="6"></line><line x1="3" y1="18" x2="21" y2="18"></line></svg> |
| </div> |
| <div class="docs-nav-item relative"> |
| <input id="search-input" type="text" placeholder="Search docs" aria-label="Search docs"> |
| </div> |
| <div class="versions-wrapper"> |
| <h4>Version:</h4> |
| <div class="nav-panel-explore" data-panel="explore"> |
| |
| <div id="version-toggle" class="context"> |
| <span class="version">4.1</span> |
| </div> |
| <ul id="versions-list" class="components"> |
| <li class="component"> |
| <ul class="versions"> |
| <li class="version is-latest"> |
| <a href="../../../../_/index.html">master</a> |
| </li> |
| </ul> |
| </li> |
| <li class="component is-current"> |
| <ul class="versions"> |
| <li class="version"> |
| <a href="../../../trunk/index.html">trunk</a> |
| </li> |
| <li class="version"> |
| <a href="../../../5.0/index.html">5.0</a> |
| </li> |
| <li class="version is-current is-latest"> |
| <a href="../../index.html">4.1</a> |
| </li> |
| <li class="version"> |
| <a href="../../../4.0/index.html">4.0</a> |
| </li> |
| <li class="version"> |
| <a href="../../../3.11/index.html">3.11</a> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> </div> |
| <div class="cf relative"> |
| <nav class="nav docs-nav full-800"> |
| <div class="nav-menu"> |
| <ul class="nav-list"> |
| <li class="nav-item is-active" data-depth="0"> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="1"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../../index.html">Main</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/glossary.html">Glossary</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/bugs.html">How to report bugs</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/contactus.html">Contact us</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item is-active" data-depth="0"> |
| <ul class="nav-list"> |
| <li class="nav-item is-current-path is-active" data-depth="1"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <span class="nav-text">Cassandra</span> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../getting_started/index.html">Getting Started</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/installing.html">Installing Cassandra</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/configuring.html">Configuring Cassandra</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/querying.html">Inserting and querying</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/drivers.html">Client drivers</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/java11.html">Support for Java 11</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../getting_started/production.html">Production recommendations</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../new/index.html">What’s new</a> |
| </span> |
| </li> |
| <li class="nav-item is-current-path is-active" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="index.html">Architecture</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="overview.html">Overview</a> |
| </span> |
| </li> |
| <li class="nav-item is-current-page is-active" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="dynamo.html">Dynamo</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="storage_engine.html">Storage engine</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="guarantees.html">Guarantees</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="messaging.html">Improved internode messaging</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="streaming.html">Improved streaming</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../data_modeling/index.html">Data modeling</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/intro.html">Introduction</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_conceptual.html">Conceptual data modeling</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_rdbms.html">RDBMS design</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_queries.html">Defining application queries</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_logical.html">Logical data modeling</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_physical.html">Physical data modeling</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_refining.html">Evaluating and refining data models</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_schema.html">Defining database schema</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../data_modeling/data_modeling_tools.html">Cassandra data modeling tools</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../cql/index.html">Cassandra Query Language (CQL)</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/definitions.html">Definitions</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/types.html">Data types</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/ddl.html">Data definition (DDL)</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/dml.html">Data manipulation (DML)</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/operators.html">Operators</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/indexes.html">Secondary indexes</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/mvs.html">Materialized views</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/functions.html">Functions</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/json.html">JSON</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/security.html">Security</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/triggers.html">Triggers</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/appendices.html">Appendices</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/changes.html">Changes</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/SASI.html">SASI</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../cql/cql_singlefile.html">Single file of CQL information</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../configuration/index.html">Configuration</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_yaml_file.html">cassandra.yaml</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_rackdc_file.html">cassandra-rackdc.properties</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_env_sh_file.html">cassandra-env.sh</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_topo_file.html">cassandra-topologies.properties</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_cl_archive_file.html">commitlog-archiving.properties</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_logback_xml_file.html">logback.xml</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/cass_jvm_options_file.html">jvm-* files</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../configuration/configuration.html">Liberating cassandra.yaml Parameters' Names from Their Units</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../operating/index.html">Operating</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="snitch.html">Snitches</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/topo_changes.html">Topology changes</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/repair.html">Repair</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/read_repair.html">Read repair</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/hints.html">Hints</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/bloom_filters.html">Bloom filters</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/compression.html">Compression</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/cdc.html">Change Data Capture (CDC)</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/backups.html">Backups</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/bulk_loading.html">Bulk loading</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/metrics.html">Metrics</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/security.html">Security</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/hardware.html">Hardware</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/compaction/index.html">Compaction</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/virtualtables.html">Virtual tables</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/auditlogging.html">Audit logging</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/audit_logging.html">Audit logging 2</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/fqllogging.html">Full query logging</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../operating/transientreplication.html">Transient replication</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../tools/index.html">Tools</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../tools/cqlsh.html">cqlsh: the CQL shell</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../tools/nodetool/nodetool.html">nodetool</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../tools/sstable/index.html">SSTable tools</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../tools/cassandra_stress.html">cassandra-stress</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../troubleshooting/index.html">Troubleshooting</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../troubleshooting/finding_nodes.html">Finding misbehaving nodes</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../troubleshooting/reading_logs.html">Reading Cassandra logs</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../troubleshooting/use_nodetool.html">Using nodetool</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../troubleshooting/use_tools.html">Using external tools to deep-dive</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <button class="nav-toggle"></button> |
| <a class="nav-link" href="../../../../_/development/index.html">Development</a> |
| </span> |
| <ul class="nav-list"> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/gettingstarted.html">Getting started</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/ide.html">Building and IDE integration</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/testing.html">Testing</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/patches.html">Contributing code changes</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/code_style.html">Code style</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/how_to_review.html">Review checklist</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/how_to_commit.html">How to commit</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/documentation.html">Working on documentation</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/ci.html">Jenkins CI environment</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/dependencies.html">Dependency management</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="3"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../../../../_/development/release_process.html">Release process</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../faq/index.html">FAQ</a> |
| </span> |
| </li> |
| <li class="nav-item" data-depth="2"> |
| <span class="nav-line"> |
| <a class="nav-link" href="../plugins/index.html">Plug-ins</a> |
| </span> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </nav> |
| <aside class="toc sidebar"> |
| <div class="toc-menu"></div> |
| </aside> |
| <main class="article default-main full-800" data-ceiling="topbar"> |
| <div class="article-header"> |
| <nav class="crumbs" aria-label="breadcrumbs"> |
| <ul> |
| <li class="crumb">Cassandra</li> |
| <li class="crumb"><a href="index.html">Architecture</a></li> |
| <li class="crumb"><a href="dynamo.html">Dynamo</a></li> |
| </ul> |
| </nav> |
| <div class="tools" role="navigation"> |
| <ul> |
| <li class="tool edit"><a href="https://github.com/apache/cassandra/edit/cassandra-4.1/doc/modules/cassandra/pages/architecture/dynamo.adoc" title="Edit Page" target="_blank" rel="noopener">Edit</a></li> |
| </ul> |
| </div> |
| </div> |
| <article class="doc"> |
| <h1 class="page">Dynamo</h1> |
| <div id="preamble"> |
| <div class="sectionbody"> |
| <div class="paragraph"> |
| <p>Apache Cassandra relies on a number of techniques from Amazon’s |
| <a href="http://courses.cse.tamu.edu/caverlee/csce438/readings/dynamo-paper.pdf">Dynamo</a> |
| distributed storage key-value system. Each node in the Dynamo system has |
| three main components:</p> |
| </div> |
| <div class="ulist"> |
| <ul> |
| <li> |
| <p>Request coordination over a partitioned dataset</p> |
| </li> |
| <li> |
| <p>Ring membership and failure detection</p> |
| </li> |
| <li> |
| <p>A local persistence (storage) engine</p> |
| </li> |
| </ul> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra primarily draws from the first two clustering components, |
| while using a storage engine based on a Log Structured Merge Tree |
| (<a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.44.2782&amp;rep=rep1&amp;type=pdf">LSM</a>). |
| In particular, Cassandra relies on Dynamo style:</p> |
| </div> |
| <div class="ulist"> |
| <ul> |
| <li> |
| <p>Dataset partitioning using consistent hashing</p> |
| </li> |
| <li> |
| <p>Multi-master replication using versioned data and tunable consistency</p> |
| </li> |
| <li> |
| <p>Distributed cluster membership and failure detection via a gossip |
| protocol</p> |
| </li> |
| <li> |
| <p>Incremental scale-out on commodity hardware</p> |
| </li> |
| </ul> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra was designed this way to meet large-scale (PiB+) |
| business-critical storage requirements. In particular, as applications |
| demanded full global replication of petabyte scale datasets along with |
| always available low-latency reads and writes, it became imperative to |
| design a new kind of database model as the relational database systems |
| of the time struggled to meet the new requirements of global scale |
| applications.</p> |
| </div> |
| </div> |
| </div> |
| <div class="sect1"> |
| <h2 id="dataset-partitioning-consistent-hashing"><a class="anchor" href="#dataset-partitioning-consistent-hashing"></a>Dataset Partitioning: Consistent Hashing</h2> |
| <div class="sectionbody"> |
| <div class="paragraph"> |
| <p>Cassandra achieves horizontal scalability by |
| <a href="https://en.wikipedia.org/wiki/Partition_(database)">partitioning</a> all |
| data stored in the system using a hash function. Each partition is |
| replicated to multiple physical nodes, often across failure domains such |
| as racks and even datacenters. As every replica can independently accept |
| mutations to every key that it owns, every key must be versioned. Unlike |
| in the original Dynamo paper where deterministic versions and vector |
| clocks were used to reconcile concurrent updates to a key, Cassandra |
| uses a simpler last write wins model where every mutation is timestamped |
| (including deletes) and then the latest version of data is the "winning" |
| value. Formally speaking, Cassandra uses a Last-Write-Wins Element-Set |
| conflict-free replicated data type for each CQL row, or |
| <a href="https://en.wikipedia.org/wiki/Conflict-free_replicated_data_type#LWW-Element-Set_(Last-Write-Wins-Element-Set)">LWW-Element-Set |
| CRDT</a>, to resolve conflicting mutations on replica sets.</p> |
| </div> |
| <div class="sect2"> |
| <h3 id="consistent-hashing-using-a-token-ring"><a class="anchor" href="#consistent-hashing-using-a-token-ring"></a>Consistent Hashing using a Token Ring</h3> |
| <div class="paragraph"> |
| <p>Cassandra partitions data over storage nodes using a special form of |
| hashing called |
| <a href="https://en.wikipedia.org/wiki/Consistent_hashing">consistent hashing</a>. In |
| naive data hashing, you typically allocate keys to buckets by taking a |
| hash of the key modulo the number of buckets. For example, if you want |
| to distribute data to 100 nodes using naive hashing you might assign |
| every node to a bucket between 0 and 99, hash the input key modulo 100, |
| and store the data on the associated bucket. In this naive scheme, |
| however, adding a single node might invalidate almost all of the |
| mappings.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra instead maps every node to one or more tokens on a continuous |
| hash ring, and defines ownership by hashing a key onto the ring and then |
| "walking" the ring in one direction, similar to the |
| <a href="https://pdos.csail.mit.edu/papers/chord:sigcomm01/chord_sigcomm.pdf">Chord</a> |
| algorithm. The main difference of consistent hashing to naive data |
| hashing is that when the number of nodes (buckets) to hash into changes, |
| consistent hashing only has to move a small fraction of the keys.</p> |
| </div> |
| <div class="paragraph"> |
| <p>For example, if we have an eight node cluster with evenly spaced tokens, |
| and a replication factor (RF) of 3, then to find the owning nodes for a |
| key we first hash that key to generate a token (which is just the hash |
| of the key), and then we "walk" the ring in a clockwise fashion until we |
| encounter three distinct nodes, at which point we have found all the |
| replicas of that key. This example of an eight node cluster with |
| <code>RF=3</code> can be visualized as follows:</p> |
| </div> |
| <div class="imageblock"> |
| <div class="content"> |
| <img src="../_images/ring.svg" alt="Token ring of an eight node cluster with a replication factor of 3"> |
| </div> |
| </div> |
| <div class="paragraph"> |
| <p>You can see that in a Dynamo like system, ranges of keys, also known as |
| <strong>token ranges</strong>, map to the same physical set of nodes. In this example, |
| all keys that fall in the token range excluding token 1 and including |
| token 2 (<code>range(t1, t2]</code>) are stored on nodes 2, 3 and 4.</p> |
| </div> |
| </div> |
| <div class="sect2"> |
| <h3 id="multiple-tokens-per-physical-node-vnodes"><a class="anchor" href="#multiple-tokens-per-physical-node-vnodes"></a>Multiple Tokens per Physical Node (vnodes)</h3> |
| <div class="paragraph"> |
| <p>Simple single token consistent hashing works well if you have many |
| physical nodes to spread data over, but with evenly spaced tokens and a |
| small number of physical nodes, incremental scaling (adding just a few |
| nodes of capacity) is difficult because there are no token selections |
| for new nodes that can leave the ring balanced. Cassandra seeks to avoid |
| token imbalance because uneven token ranges lead to uneven request load. |
| For example, in the previous example there is no way to add a ninth |
| token without causing imbalance; instead we would have to insert <code>8</code> |
| tokens in the midpoints of the existing ranges.</p> |
| </div> |
| <div class="paragraph"> |
| <p>The Dynamo paper advocates for the use of "virtual nodes" to solve this |
| imbalance problem. Virtual nodes solve the problem by assigning multiple |
| tokens in the token ring to each physical node. By allowing a single |
| physical node to take multiple positions in the ring, we can make small |
| clusters look larger and therefore even with a single physical node |
| addition we can make it look like we added many more nodes, effectively |
| taking many smaller pieces of data from more ring neighbors when we add |
| even a single node.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra introduces some nomenclature to handle these concepts:</p> |
| </div> |
| <div class="ulist"> |
| <ul> |
| <li> |
| <p><strong>Token</strong>: A single position on the dynamo style hash |
| ring.</p> |
| </li> |
| <li> |
| <p><strong>Endpoint</strong>: A single physical IP and port on the network.</p> |
| </li> |
| <li> |
| <p><strong>Host ID</strong>: A unique identifier for a single "physical" node, usually |
| present at one <strong>Endpoint</strong> and containing one or more |
| <strong>Tokens</strong>.</p> |
| </li> |
| <li> |
| <p><strong>Virtual Node</strong> (or <strong>vnode</strong>): A <strong>Token</strong> on the hash ring |
| owned by the same physical node, one with the same <strong>Host |
| ID</strong>.</p> |
| </li> |
| </ul> |
| </div> |
| <div class="paragraph"> |
| <p>The mapping of <strong>Tokens</strong> to <strong>Endpoints</strong> gives rise to the <strong>Token Map</strong> |
| where Cassandra keeps track of what ring positions map to which physical |
| endpoints. For example, in the following figure we can represent an |
| eight node cluster using only four physical nodes by assigning two |
| tokens to every node:</p> |
| </div> |
| <div class="imageblock"> |
| <div class="content"> |
| <img src="../_images/vnodes.svg" alt="Token ring of four physical nodes, each assigned two tokens"> |
| </div> |
| </div> |
| <div class="paragraph"> |
| <p>Multiple tokens per physical node provide the following benefits:</p> |
| </div> |
| <div class="olist arabic"> |
| <ol class="arabic"> |
| <li> |
| <p>When a new node is added it accepts approximately equal amounts of |
| data from other nodes in the ring, resulting in equal distribution of |
| data across the cluster.</p> |
| </li> |
| <li> |
| <p>When a node is decommissioned, it loses data roughly equally to other |
| members of the ring, again keeping equal distribution of data across the |
| cluster.</p> |
| </li> |
| <li> |
| <p>If a node becomes unavailable, query load (especially token-aware |
| query load) is evenly distributed across many other nodes.</p> |
| </li> |
| </ol> |
| </div> |
| <div class="paragraph"> |
| <p>Multiple tokens, however, can also have disadvantages:</p> |
| </div> |
| <div class="olist arabic"> |
| <ol class="arabic"> |
| <li> |
| <p>Every token introduces up to <code>2 * (RF - 1)</code> additional neighbors on |
| the token ring, which means that there are more combinations of node |
| failures where we lose availability for a portion of the token ring. The |
| more tokens you have, |
| <a href="https://jolynch.github.io/pdf/cassandra-availability-virtual.pdf">the |
| higher the probability of an outage</a>.</p> |
| </li> |
| <li> |
| <p>Cluster-wide maintenance operations are often slowed. For example, as |
| the number of tokens per node is increased, the number of discrete |
| repair operations the cluster must do also increases.</p> |
| </li> |
| <li> |
| <p>Performance of operations that span token ranges could be affected.</p> |
| </li> |
| </ol> |
| </div> |
| <div class="paragraph"> |
| <p>Note that in Cassandra <code>2.x</code>, the only token allocation algorithm |
| available was picking random tokens, which meant that to keep balance |
| the default number of tokens per node had to be quite high, at <code>256</code>. |
| This had the effect of coupling many physical endpoints together, |
| increasing the risk of unavailability. That is why in <code>3.x +</code> the new |
| deterministic token allocator was added which intelligently picks tokens |
| such that the ring is optimally balanced while requiring a much lower |
| number of tokens per physical node.</p> |
| </div> |
| </div> |
| </div> |
| </div> |
| <div class="sect1"> |
| <h2 id="multi-master-replication-versioned-data-and-tunable-consistency"><a class="anchor" href="#multi-master-replication-versioned-data-and-tunable-consistency"></a>Multi-master Replication: Versioned Data and Tunable Consistency</h2> |
| <div class="sectionbody"> |
| <div class="paragraph"> |
| <p>Cassandra replicates every partition of data to many nodes across the |
| cluster to maintain high availability and durability. When a mutation |
| occurs, the coordinator hashes the partition key to determine the token |
| range the data belongs to and then replicates the mutation to the |
| replicas of that data according to the |
| <code>Replication Strategy</code>.</p> |
| </div> |
| <div class="paragraph"> |
| <p>All replication strategies have the notion of a <strong>replication factor</strong> |
| (<code>RF</code>), which indicates to Cassandra how many copies of the partition |
| should exist. For example with a <code>RF=3</code> keyspace, the data will be |
| written to three distinct <strong>replicas</strong>. Replicas are always chosen such |
| that they are distinct physical nodes which is achieved by skipping |
| virtual nodes if needed. Replication strategies may also choose to skip |
| nodes present in the same failure domain such as racks or datacenters so |
| that Cassandra clusters can tolerate failures of whole racks and even |
| datacenters of nodes.</p> |
| </div> |
| <div class="sect2"> |
| <h3 id="replication-strategy"><a class="anchor" href="#replication-strategy"></a>Replication Strategy</h3> |
| <div class="paragraph"> |
| <p>Cassandra supports pluggable <strong>replication strategies</strong>, which determine |
| which physical nodes act as replicas for a given token range. Every |
| keyspace of data has its own replication strategy. All production |
| deployments should use the <code>NetworkTopologyStrategy</code> while the |
| <code>SimpleStrategy</code> replication strategy is useful only for testing |
| clusters where you do not yet know the datacenter layout of the cluster.</p> |
| </div> |
| <div class="sect3"> |
| <h4 id="network-topology-strategy"><a class="anchor" href="#network-topology-strategy"></a><code>NetworkTopologyStrategy</code></h4> |
| <div class="paragraph"> |
| <p><code>NetworkTopologyStrategy</code> requires a specified replication factor |
| for each datacenter in the cluster. Even if your cluster only uses a |
| single datacenter, <code>NetworkTopologyStrategy</code> is recommended over |
| <code>SimpleStrategy</code> to make it easier to add new physical or virtual |
| datacenters to the cluster later, if required.</p> |
| </div> |
| <div class="paragraph"> |
| <p>In addition to allowing the replication factor to be specified |
| individually by datacenter, <code>NetworkTopologyStrategy</code> also attempts to |
| choose replicas within a datacenter from different racks as specified by |
| the <code>Snitch</code>. If the number of racks is greater than or equal |
| to the replication factor for the datacenter, each replica is guaranteed |
| to be chosen from a different rack. Otherwise, each rack will hold at |
| least one replica, but some racks may hold more than one. Note that this |
| rack-aware behavior has some potentially |
| <a href="https://issues.apache.org/jira/browse/CASSANDRA-3810">surprising |
| implications</a>. For example, if there are not an even number of nodes in |
| each rack, the data load on the smallest rack may be much higher. |
| Similarly, if a single node is bootstrapped into a brand new rack, it |
| will be considered a replica for the entire ring. For this reason, many |
| operators choose to configure all nodes in a single availability zone or |
| similar failure domain as a single "rack".</p> |
| </div> |
| </div> |
| <div class="sect3"> |
| <h4 id="simple-strategy"><a class="anchor" href="#simple-strategy"></a><code>SimpleStrategy</code></h4> |
| <div class="paragraph"> |
| <p><code>SimpleStrategy</code> allows a single integer <code>replication_factor</code> to be |
| defined. This determines the number of nodes that should contain a copy |
| of each row. For example, if <code>replication_factor</code> is 3, then three |
| different nodes should store a copy of each row.</p> |
| </div> |
| <div class="paragraph"> |
| <p><code>SimpleStrategy</code> treats all nodes identically, ignoring any configured |
| datacenters or racks. To determine the replicas for a token range, |
| Cassandra iterates through the tokens in the ring, starting with the |
| token range of interest. For each token, it checks whether the owning |
| node has been added to the set of replicas, and if it has not, it is |
| added to the set. This process continues until <code>replication_factor</code> |
| distinct nodes have been added to the set of replicas.</p> |
| </div> |
| </div> |
| <div class="sect3"> |
| <h4 id="transient-replication"><a class="anchor" href="#transient-replication"></a>Transient Replication</h4> |
| <div class="paragraph"> |
| <p>Transient replication is an experimental feature in Cassandra 4.0 not |
| present in the original Dynamo paper. This feature allows configuration of a |
| subset of replicas to replicate only data that hasn’t been incrementally |
| repaired. This configuration decouples data redundancy from availability. |
| For instance, if you have a keyspace replicated at RF=3, and alter it to |
| RF=5 with two transient replicas, you go from tolerating one |
| failed replica to tolerating two, without a corresponding |
| increase in storage usage. Now, three nodes will replicate all |
| the data for a given token range, and the other two will only replicate |
| data that hasn’t been incrementally repaired.</p> |
| </div> |
| <div class="paragraph"> |
| <p>To use transient replication, first enable the option in |
| <code>cassandra.yaml</code>. Once enabled, both <code>SimpleStrategy</code> and |
| <code>NetworkTopologyStrategy</code> can be configured to transiently replicate |
| data. Configure it by specifying replication factor as |
| <code>&lt;total_replicas&gt;/&lt;transient_replicas&gt;</code>. Both <code>SimpleStrategy</code> and |
| <code>NetworkTopologyStrategy</code> support configuring transient replication.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Transiently replicated keyspaces only support tables created with |
| <code>read_repair</code> set to <code>NONE</code>; monotonic reads are not currently |
| supported. You also can’t use <code>LWT</code>, logged batches, or counters in 4.0. |
| You will possibly never be able to use materialized views with |
| transiently replicated keyspaces and probably never be able to use |
| secondary indices with them.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Transient replication is an experimental feature that is not ready |
| for production use. The expected audience is experienced users of |
| Cassandra capable of fully validating a deployment of their particular |
| application. That means being able to check that operations like reads, |
| writes, decommission, remove, rebuild, repair, and replace all work with |
| your queries, data, configuration, operational practices, and |
| availability requirements.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Anticipated additional features in <code>4.next</code> are support for monotonic reads with |
| transient replication, as well as LWT, logged batches, and counters.</p> |
| </div> |
| </div> |
| </div> |
| <div class="sect2"> |
| <h3 id="data-versioning"><a class="anchor" href="#data-versioning"></a>Data Versioning</h3> |
| <div class="paragraph"> |
| <p>Cassandra uses mutation timestamp versioning to guarantee eventual |
| consistency of data. Specifically all mutations that enter the system do |
| so with a timestamp provided either from a client clock or, absent a |
| client provided timestamp, from the coordinator node’s clock. Updates |
| resolve according to the conflict resolution rule of last write wins. |
| Cassandra’s correctness does depend on these clocks, so make sure a |
| proper time synchronization process is running such as NTP.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra applies separate mutation timestamps to every column of every |
| row within a CQL partition. Rows are guaranteed to be unique by primary |
| key, and each column in a row resolves concurrent mutations according to |
| last-write-wins conflict resolution. This means that updates to |
| different primary keys within a partition can actually resolve without |
| conflict! Furthermore the CQL collection types such as maps and sets use |
| this same conflict free mechanism, meaning that concurrent updates to |
| maps and sets are guaranteed to resolve as well.</p> |
| </div> |
| <div class="sect3"> |
| <h4 id="replica-synchronization"><a class="anchor" href="#replica-synchronization"></a>Replica Synchronization</h4> |
| <div class="paragraph"> |
| <p>As replicas in Cassandra can accept mutations independently, it is |
| possible for some replicas to have newer data than others. Cassandra has |
| many best-effort techniques to drive convergence of replicas including |
| replica read repair in the read path and |
| hinted handoff in the write path.</p> |
| </div> |
| <div class="paragraph"> |
| <p>These techniques are only best-effort, however, and to guarantee |
| eventual consistency Cassandra implements anti-entropy |
| repair, where replicas calculate hierarchical hash-trees over |
| their datasets called <a href="https://en.wikipedia.org/wiki/Merkle_tree">Merkle |
| trees</a> that can then be compared across replicas to identify mismatched |
| data. Like the original Dynamo paper, Cassandra supports full repairs |
| where replicas hash their entire dataset, create Merkle trees, send them |
| to each other and sync any ranges that don’t match.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Unlike the original Dynamo paper, Cassandra also implements sub-range |
| repair and incremental repair. Sub-range repair allows Cassandra to |
| increase the resolution of the hash trees (potentially down to the |
| single partition level) by creating a larger number of trees that span |
| only a portion of the data range. Incremental repair allows Cassandra to |
| only repair the partitions that have changed since the last repair.</p> |
| </div> |
| </div> |
| </div> |
| <div class="sect2"> |
| <h3 id="tunable-consistency"><a class="anchor" href="#tunable-consistency"></a>Tunable Consistency</h3> |
| <div class="paragraph"> |
| <p>Cassandra supports a per-operation tradeoff between consistency and |
| availability through <strong>Consistency Levels</strong>. Cassandra’s consistency |
| levels are a version of Dynamo’s <code>R + W > N</code> consistency mechanism where |
| operators could configure the number of nodes that must participate in |
| reads (<code>R</code>) and writes (<code>W</code>) to be larger than the replication factor |
| (<code>N</code>). In Cassandra, you instead choose from a menu of common |
| consistency levels which allow the operator to pick <code>R</code> and <code>W</code> behavior |
| without knowing the replication factor. Generally writes will be visible |
| to subsequent reads when the read consistency level contains enough |
| nodes to guarantee a quorum intersection with the write consistency |
| level.</p> |
| </div> |
| <div class="paragraph"> |
| <p>The following consistency levels are available:</p> |
| </div> |
| <div class="dlist"> |
| <dl> |
| <dt class="hdlist1"><code>ONE</code></dt> |
| <dd> |
| <p>Only a single replica must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>TWO</code></dt> |
| <dd> |
| <p>Two replicas must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>THREE</code></dt> |
| <dd> |
| <p>Three replicas must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>QUORUM</code></dt> |
| <dd> |
| <p>A majority (n/2 + 1) of the replicas must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>ALL</code></dt> |
| <dd> |
| <p>All of the replicas must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>LOCAL_QUORUM</code></dt> |
| <dd> |
| <p>A majority of the replicas in the local datacenter (whichever |
| datacenter the coordinator is in) must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>EACH_QUORUM</code></dt> |
| <dd> |
| <p>A majority of the replicas in each datacenter must respond.</p> |
| </dd> |
| <dt class="hdlist1"><code>LOCAL_ONE</code></dt> |
| <dd> |
| <p>Only a single replica must respond. In a multi-datacenter cluster, |
| this also guarantees that read requests are not sent to replicas in a |
| remote datacenter.</p> |
| </dd> |
| <dt class="hdlist1"><code>ANY</code></dt> |
| <dd> |
| <p>A single replica may respond, or the coordinator may store a hint. If |
| a hint is stored, the coordinator will later attempt to replay the |
| hint and deliver the mutation to the replicas. This consistency level |
| is only accepted for write operations.</p> |
| </dd> |
| </dl> |
| </div> |
| <div class="paragraph"> |
| <p>Write operations <strong>are always sent to all replicas</strong>, regardless of |
| consistency level. The consistency level simply controls how many |
| responses the coordinator waits for before responding to the client.</p> |
| </div> |
| <div class="paragraph"> |
| <p>For read operations, the coordinator generally only issues read commands |
| to enough replicas to satisfy the consistency level. The one exception |
| to this is when speculative retry may issue a redundant read request to |
| an extra replica if the original replicas have not responded within a |
| specified time window.</p> |
| </div> |
| <div class="sect3"> |
| <h4 id="picking-consistency-levels"><a class="anchor" href="#picking-consistency-levels"></a>Picking Consistency Levels</h4> |
| <div class="paragraph"> |
| <p>It is common to pick read and write consistency levels such that the |
| replica sets overlap, resulting in all acknowledged writes being visible |
| to subsequent reads. This is typically expressed in the same terms |
| Dynamo does, in that <code>W + R > RF</code>, where <code>W</code> is the write consistency |
| level, <code>R</code> is the read consistency level, and <code>RF</code> is the replication |
| factor. For example, if <code>RF = 3</code>, a <code>QUORUM</code> request will require |
| responses from at least <code>2/3</code> replicas. If <code>QUORUM</code> is used for both |
| writes and reads, at least one of the replicas is guaranteed to |
| participate in <em>both</em> the write and the read request, which in turn |
| guarantees that the quorums will overlap and the write will be visible |
| to the read.</p> |
| </div> |
| <div class="paragraph"> |
| <p>In a multi-datacenter environment, <code>LOCAL_QUORUM</code> can be used to provide |
| a weaker but still useful guarantee: reads are guaranteed to see the |
| latest write from within the same datacenter. This is often sufficient |
| as clients homed to a single datacenter will read their own writes.</p> |
| </div> |
| <div class="paragraph"> |
| <p>If this type of strong consistency isn’t required, lower consistency |
| levels like <code>LOCAL_ONE</code> or <code>ONE</code> may be used to improve throughput, |
| latency, and availability. With replication spanning multiple |
| datacenters, <code>LOCAL_ONE</code> is typically less available than <code>ONE</code> but is |
| faster as a rule. Indeed <code>ONE</code> will succeed if a single replica is |
| available in any datacenter.</p> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| <div class="sect1"> |
| <h2 id="distributed-cluster-membership-and-failure-detection"><a class="anchor" href="#distributed-cluster-membership-and-failure-detection"></a>Distributed Cluster Membership and Failure Detection</h2> |
| <div class="sectionbody"> |
| <div class="paragraph"> |
| <p>The replication protocols and dataset partitioning rely on knowing which |
| nodes are alive and dead in the cluster so that write and read |
| operations can be optimally routed. In Cassandra liveness information is |
| shared in a distributed fashion through a failure detection mechanism |
| based on a gossip protocol.</p> |
| </div> |
| <div class="sect2"> |
| <h3 id="gossip"><a class="anchor" href="#gossip"></a>Gossip</h3> |
| <div class="paragraph"> |
| <p>Gossip is how Cassandra propagates basic cluster bootstrapping |
| information such as endpoint membership and internode network protocol |
| versions. In Cassandra’s gossip system, nodes exchange state information |
| not only about themselves but also about other nodes they know about. |
| This information is versioned with a vector clock of |
| <code>(generation, version)</code> tuples, where the generation is a monotonic |
| timestamp and version is a logical clock that increments roughly every |
| second. These logical clocks allow Cassandra gossip to ignore old |
| versions of cluster state just by inspecting the logical clocks |
| presented with gossip messages.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Every node in the Cassandra cluster runs the gossip task independently |
| and periodically. Every second, every node in the cluster:</p> |
| </div> |
| <div class="olist arabic"> |
| <ol class="arabic"> |
| <li> |
| <p>Updates the local node’s heartbeat state (the version) and constructs |
| the node’s local view of the cluster gossip endpoint state.</p> |
| </li> |
| <li> |
| <p>Picks a random other node in the cluster to exchange gossip endpoint |
| state with.</p> |
| </li> |
| <li> |
| <p>Probabilistically attempts to gossip with any unreachable nodes (if |
| any exist).</p> |
| </li> |
| <li> |
| <p>Gossips with a seed node if that didn’t happen in step 2.</p> |
| </li> |
| </ol> |
| </div> |
| <div class="paragraph"> |
| <p>When an operator first bootstraps a Cassandra cluster they designate |
| certain nodes as seed nodes. Any node can be a seed node and the only |
| difference between seed and non-seed nodes is seed nodes are allowed to |
| bootstrap into the ring without seeing any other seed nodes. |
| Furthermore, once a cluster is bootstrapped, seed nodes become |
| hotspots for gossip due to step 4 above.</p> |
| </div> |
| <div class="paragraph"> |
| <p>As non-seed nodes must be able to contact at least one seed node in |
| order to bootstrap into the cluster, it is common to include multiple |
| seed nodes, often one for each rack or datacenter. Seed nodes are often |
| chosen using existing off-the-shelf service discovery mechanisms.</p> |
| </div> |
| <div class="admonitionblock note"> |
| <table> |
| <tr> |
| <td class="icon"> |
| <i class="fa icon-note" title="Note"></i> |
| </td> |
| <td class="content"> |
| <div class="title">Note</div> |
| <div class="paragraph"> |
| <p>Nodes do not have to agree on the seed nodes, and indeed once a cluster |
| is bootstrapped, newly launched nodes can be configured to use any |
| existing nodes as seeds. The only advantage to picking the same nodes |
| as seeds is it increases their usefulness as gossip hotspots.</p> |
| </div> |
| </td> |
| </tr> |
| </table> |
| </div> |
| <div class="paragraph"> |
| <p>Currently, gossip also propagates token metadata and schema |
| <em>version</em> information. This information forms the control plane for |
| scheduling data movements and schema pulls. For example, if a node sees |
| a mismatch in schema version in gossip state, it will schedule a schema |
| sync task with the other nodes. As token information propagates via |
| gossip it is also the control plane for teaching nodes which endpoints |
| own what data.</p> |
| </div> |
| </div> |
| <div class="sect2"> |
| <h3 id="ring-membership-and-failure-detection"><a class="anchor" href="#ring-membership-and-failure-detection"></a>Ring Membership and Failure Detection</h3> |
| <div class="paragraph"> |
| <p>Gossip forms the basis of ring membership, but the <strong>failure detector</strong> |
| ultimately makes decisions about whether nodes are <code>UP</code> or <code>DOWN</code>. Every node |
| in Cassandra runs a variant of the |
| <a href="https://www.computer.org/csdl/proceedings-article/srds/2004/22390066/12OmNvT2phv">Phi |
| Accrual Failure Detector</a>, in which every node is constantly making an |
| independent decision about whether its peer nodes are available. This |
| decision is primarily based on received heartbeat state. For example, if |
| a node does not see an increasing heartbeat from a node for a certain |
| amount of time, the failure detector "convicts" that node, at which |
| point Cassandra will stop routing reads to it (writes will typically be |
| written to hints). If/when the node starts heartbeating again, Cassandra |
| will try to reach out and connect, and if it can open communication |
| channels it will mark that node as available.</p> |
| </div> |
| <div class="admonitionblock note"> |
| <table> |
| <tr> |
| <td class="icon"> |
| <i class="fa icon-note" title="Note"></i> |
| </td> |
| <td class="content"> |
| <div class="title">Note</div> |
| <div class="paragraph"> |
| <p><code>UP</code> and <code>DOWN</code> state are local node decisions and are not propagated with |
| gossip. Heartbeat state is propagated with gossip, but nodes will not |
| consider each other as <code>UP</code> until they can successfully message each |
| other over an actual network channel.</p> |
| </div> |
| </td> |
| </tr> |
| </table> |
| </div> |
| <div class="paragraph"> |
| <p>Cassandra will never remove a node from gossip state without |
| explicit instruction from an operator via a decommission operation or a |
| new node bootstrapping with a <code>replace_address_first_boot</code> option. This |
| choice is intentional to allow Cassandra nodes to temporarily fail |
| without causing data to needlessly re-balance. This also helps to |
| prevent simultaneous range movements, where multiple replicas of a token |
| range are moving at the same time, which can violate monotonic |
| consistency and can even cause data loss.</p> |
| </div> |
| </div> |
| </div> |
| </div> |
| <div class="sect1"> |
| <h2 id="incremental-scale-out-on-commodity-hardware"><a class="anchor" href="#incremental-scale-out-on-commodity-hardware"></a>Incremental Scale-out on Commodity Hardware</h2> |
| <div class="sectionbody"> |
| <div class="paragraph"> |
| <p>Cassandra scales out to meet the requirements of growth in data size and |
| request rates. Scaling out means adding additional nodes to the ring, |
| and every additional node brings linear improvements in compute and |
| storage. In contrast, scaling up implies adding more capacity to the |
| existing database nodes. Cassandra is also capable of scale-up, and in |
| certain environments it may be preferable depending on the deployment. |
| Cassandra gives operators the flexibility to choose either scale-out or |
| scale-up.</p> |
| </div> |
| <div class="paragraph"> |
| <p>One key aspect of Dynamo that Cassandra follows is to attempt to run on |
| commodity hardware, and many engineering choices are made under this |
| assumption. For example, Cassandra assumes nodes can fail at any time, |
| auto-tunes to make the best use of CPU and memory resources available |
| and makes heavy use of advanced compression and caching techniques to |
| get the most storage out of limited memory and storage capabilities.</p> |
| </div> |
| <div class="sect2"> |
| <h3 id="simple-query-model"><a class="anchor" href="#simple-query-model"></a>Simple Query Model</h3> |
| <div class="paragraph"> |
| <p>Cassandra, like Dynamo, chooses not to provide cross-partition |
| transactions that are common in SQL Relational Database Management |
| Systems (RDBMS). This both gives the programmer a simpler read and write |
| API, and allows Cassandra to more easily scale horizontally since |
| multi-partition transactions spanning multiple nodes are notoriously |
| difficult to implement and typically have high latency.</p> |
| </div> |
| <div class="paragraph"> |
| <p>Instead, Cassandra chooses to offer fast, consistent latency at any |
| scale for single partition operations, allowing retrieval of entire |
| partitions or only subsets of partitions based on primary key filters. |
| Furthermore, Cassandra does support single partition compare and swap |
| functionality via the lightweight transaction CQL API.</p> |
| </div> |
| </div> |
| <div class="sect2"> |
| <h3 id="simple-interface-for-storing-records"><a class="anchor" href="#simple-interface-for-storing-records"></a>Simple Interface for Storing Records</h3> |
| <div class="paragraph"> |
| <p>Cassandra, in a slight departure from Dynamo, chooses a storage |
| interface that is more sophisticated than "simple key value" stores but |
| significantly less complex than SQL relational data models. Cassandra |
| presents a wide-column store interface, where partitions of data contain |
| multiple rows, each of which contains a flexible set of individually |
| typed columns. Every row is uniquely identified by the partition key and |
| one or more clustering keys, and every row can have as many columns as |
| needed.</p> |
| </div> |
| <div class="paragraph"> |
| <p>This allows users to flexibly add new columns to existing datasets as |
| new requirements surface. Schema changes involve only metadata changes |
| and run fully concurrently with live workloads. Therefore, users can |
| safely add columns to existing Cassandra databases while remaining |
| confident that query performance will not degrade.</p> |
| </div> |
| </div> |
| </div> |
| </div> |
| </article> |
| </main> |
| </div> |
| </div> |
| <footer class="grad grad--two flex-center pb-xlarge"> |
| <div class="inner text-center z2 relative"> |
| <h2 class="white py-small">Get started with Cassandra, fast.</h2> |
| <a id="footer-cta" href="/_/quickstart.html" class="btn btn--filled ma-medium">Quickstart Guide</a> |
| </div> |
| <div class="inner flex flex-distribute-items mt-xlarge z2 relative"> |
| <div class="col-2"> |
| <div id="footer-logo" class="logo logo--footer mb-medium"><img src="../../../../assets/img/logo-white-r.png" alt="Cassandra Logo"></div> |
| <p>Apache Cassandra<img src="../../../../assets/img/registered.svg" alt="®" style="width:18px;"> powers mission-critical deployments with improved performance and unparalleled levels of scale in the cloud.</p> |
| <!-- External social links open in a new tab; rel="noopener" keeps the opened page from reaching window.opener. --> |
| <div class="footer-social-icons"> |
| <a href="https://twitter.com/cassandra?lang=en" target="_blank" rel="noopener"><img src="../../../../assets/img/twitter-icon-circle-white.svg" alt="twitter icon" width="24"></a> |
| <a href="https://www.linkedin.com/company/apache-cassandra/" target="_blank" rel="noopener"><img src="../../../../assets/img/LI-In-Bug.png" alt="linked-in icon" width="24"></a> |
| <a href="https://www.youtube.com/c/PlanetCassandra" target="_blank" rel="noopener"><img src="../../../../assets/img/youtube-icon.png" alt="youtube icon" width="24"></a> |
| </div> |
| </div> |
| <div class="col-2 flex flex-center"> |
| <ul class="columns-2"> |
| <li class="mb-small"><a href="/">Home</a></li> |
| <li class="mb-small"><a href="/_/cassandra-basics.html">Cassandra Basics</a></li> |
| <li class="mb-small"><a href="/_/quickstart.html">Quickstart</a></li> |
| <li class="mb-small"><a href="/_/ecosystem.html">Ecosystem</a></li> |
| <li class="mb-small"><a href="/doc/latest/">Documentation</a></li> |
| <li class="mb-small"><a href="/_/community.html">Community</a></li> |
| <li class="mb-small"><a href="/_/case-studies.html">Case Studies</a></li> |
| <li class="mb-small"><a href="/_/resources.html">Resources</a></li> |
| <li class="mb-small"><a href="/_/blog.html">Blog</a></li> |
| </ul> |
| </div> |
| </div> |
| </footer> |
| <div class="lower-footer bg-white pa-medium"> |
| <div class="flex flex-row flex-vert-center"> |
| <div class="pr-medium"><img src="../../../../assets/img/feather-small.png" alt="ASF" width="20"></div> |
| <div class="pr-medium"><a href="https://www.apache.org/" target="_blank">Foundation</a></div> |
| <div class="pr-medium"><a href="https://www.apache.org/events/current-event.html" target="_blank">Events</a></div> |
| <div class="pr-medium"><a href="https://www.apache.org/licenses/" target="_blank">License</a></div> |
| <div class="pr-medium"><a href="https://www.apache.org/foundation/thanks" target="_blank">Thanks</a></div> |
| <div class="pr-medium"><a href="https://www.apache.org/security" target="_blank">Security</a></div> |
| <div class="pr-medium"><a href="https://privacy.apache.org/policies/privacy-policy-public.html" target="_blank">Privacy</a></div> |
| <div class="pr-medium"><a href="https://www.apache.org/foundation/sponsorship" target="_blank">Sponsorship</a></div> |
| </div> |
| <p class="my-medium">© 2009-<script>document.write(new Date().getFullYear())</script> <a href="https://apache.org" target="_blank">The Apache Software Foundation</a> under the terms of the Apache License 2.0. Apache, the Apache feather logo, Apache Cassandra, Cassandra, and the Cassandra logo, are either registered trademarks or trademarks of The Apache Software Foundation.</p> |
| </div> |
| <div id="fade" class="hidden"></div> |
| <div id="modal" class="hidden"> |
| <div id="close-modal" class="cursor-pointer"><svg viewBox="0 0 24 24" width="24" height="24" stroke="currentColor" stroke-width="2" fill="none" stroke-linecap="round" stroke-linejoin="round" class="css-i6dzq1"><line x1="18" y1="6" x2="6" y2="18"></line><line x1="6" y1="6" x2="18" y2="18"></line></svg></div> |
| <div id="mod-content" class="vid-mod-content resp-container"></div> |
| </div> |
| <script src="../../../../assets/js/site.js"></script> |
| <script async src="../../../../assets/js/vendor/highlight.js"></script> |
| <script src="../../../../assets/js/vendor/lunr.js"></script> |
| <script src="../../../../assets/js/vendor/search.js" id="search-script" data-base-path="../../../.." data-page-path="/Cassandra/4.1/cassandra/architecture/dynamo.html"></script> |
| <script async src="../../../../assets/../search-index.js"></script> |
| <script> |
| jQuery(function(){ |
| var windowW = $(window).width(); |
| $(document) |
| .on('click','.mobile-nav-icon',function(){ |
| $('.main-nav').fadeIn(); |
| }) |
| .on('click','.main-nav',function(){ |
| if(windowW <= 1000){ |
| $(this).fadeOut(); |
| } |
| }) |
| .on('click','#version-toggle',function(){ |
| $(this).toggleClass('active'); |
| $(this).next().fadeToggle(); |
| }) |
| .on('click','#mobile-docs-nav-burger', function(){ |
| $(this).toggleClass('active'); |
| $('.docs-nav').toggleClass('active'); |
| }); |
| var url = window.location.pathname; |
| var isQuickstart = url.includes('quickstart.html'); |
| if(isQuickstart){ |
| var footerCTA = document.getElementById('footer-cta'); |
| footerCTA.innerHTML = 'Get latest updates'; |
| footerCTA.setAttribute('href', '/_/blog.html'); |
| } |
| }); |
| </script> |
| </div> |
| </body> |
| </html> |