blob: 83d798a88c71767fd437e927c4d419ed598e904e [file] [log] [blame]
<!doctype html>
<html lang="en" dir="ltr" class="docs-wrapper docs-doc-page docs-version-4.9.2 plugin-docs plugin-id-default docs-doc-id-development/protocol">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.0">
<title data-rh="true">The BookKeeper protocol | Apache BookKeeper</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://bookkeeper.apache.org/docs/4.9.2/development/protocol"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docusaurus_version" content="4.9.2"><meta data-rh="true" name="docusaurus_tag" content="docs-default-4.9.2"><meta data-rh="true" name="docsearch:version" content="4.9.2"><meta data-rh="true" name="docsearch:docusaurus_tag" content="docs-default-4.9.2"><meta data-rh="true" property="og:title" content="The BookKeeper protocol | Apache BookKeeper"><meta data-rh="true" name="description" content="BookKeeper uses a special replication protocol for guaranteeing persistent storage of entries in an ensemble of bookies."><meta data-rh="true" property="og:description" content="BookKeeper uses a special replication protocol for guaranteeing persistent storage of entries in an ensemble of bookies."><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://bookkeeper.apache.org/docs/4.9.2/development/protocol"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/docs/4.9.2/development/protocol" hreflang="en"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/docs/4.9.2/development/protocol" hreflang="x-default"><link rel="stylesheet" href="/assets/css/styles.49914aab.css">
<link rel="preload" href="/assets/js/runtime~main.1c369ecb.js" as="script">
<link rel="preload" href="/assets/js/main.812b2dbb.js" as="script">
</head>
<body class="navigation-with-keyboard">
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}()</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--dark_i4oU"></div><b class="navbar__title text--truncate">Apache BookKeeper</b></a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/docs/4.9.2/overview/">Documentation</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/mailing-lists">Mailing lists</a></li><li><a class="dropdown__link" href="/community/slack">Slack</a></li><li><a href="https://github.com/apache/bookkeeper/issues" target="_blank" rel="noopener noreferrer" class="dropdown__link">Github issues<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/community/releases">Release management</a></li><li><a class="dropdown__link" href="/community/meeting">Community meetings</a></li><li><a class="dropdown__link" 
href="/community/contributing">Contribution guide</a></li><li><a class="dropdown__link" href="/community/coding-guide">Coding guide</a></li><li><a class="dropdown__link" href="/community/testing">Testing guide</a></li><li><a class="dropdown__link" href="/community/issue-report">Issue report guide</a></li><li><a class="dropdown__link" href="/community/release-guide">Release guide</a></li><li><a class="dropdown__link" href="/community/presentations">Presentations</a></li><li><a class="dropdown__link" href="/community/bookkeeper-proposals">BookKeeper proposals (BP)</a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Project</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/project/who">Who are we?</a></li><li><a class="dropdown__link" href="/project/bylaws">Bylaws</a></li><li><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="dropdown__link">License<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/project/privacy">Privacy policy</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path 
fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link" aria-haspopup="true" aria-expanded="false" role="button" href="/docs/4.9.2/overview/">4.9.2</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/development/protocol">Next</a></li><li><a class="dropdown__link" href="/docs/development/protocol">4.16.5</a></li><li><a class="dropdown__link" href="/docs/4.15.5/development/protocol">4.15.5</a></li><li><a class="dropdown__link" href="/docs/4.14.8/development/protocol">4.14.8</a></li><li><a class="dropdown__link" href="/docs/4.13.0/development/protocol">4.13.0</a></li><li><a class="dropdown__link" href="/docs/4.12.1/development/protocol">4.12.1</a></li><li><a class="dropdown__link" href="/docs/4.11.1/development/protocol">4.11.1</a></li><li><a class="dropdown__link" href="/docs/4.10.0/development/protocol">4.10.0</a></li><li><a aria-current="page" class="dropdown__link dropdown__link--active" href="/docs/4.9.2/development/protocol">4.9.2</a></li><li><a class="dropdown__link" href="/docs/4.8.2/development/protocol">4.8.2</a></li><li><a class="dropdown__link" href="/docs/4.7.3/development/protocol">4.7.3</a></li><li><a class="dropdown__link" href="/docs/4.6.2/development/protocol">4.6.2</a></li><li><a class="dropdown__link" href="/docs/4.5.1/development/protocol">4.5.1</a></li></ul></div><a class="navbar__item navbar__link" href="/releases">Download</a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" 
height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="searchBox_ZlJk"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0 docsWrapper_BCFX"><button aria-label="Scroll back to top" class="clean-btn theme-back-to-top-button backToTopButton_sjWU" type="button"></button><div class="docPage__5DB"><aside class="theme-doc-sidebar-container 
docSidebarContainer_b6E3"><div class="sidebarViewport_Xe31"><div class="sidebar_njMd"><nav aria-label="Docs sidebar" class="menu thin-scrollbar menu_SIkG"><ul class="theme-doc-sidebar-menu menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-1 menu__list-item"><a class="menu__link" href="/docs/4.9.2/overview/">Overview</a></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/getting-started/installation">Getting started</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/deployment/manual">Deployment</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/admin/bookies">Administration</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/api/overview">API</a></div></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/security/overview">Security</a></div></li><li 
class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret menu__link--active" aria-expanded="true" href="/docs/4.9.2/development/protocol">Development</a></div><ul style="display:block;overflow:visible;height:auto" class="menu__list"><li class="theme-doc-sidebar-item-link theme-doc-sidebar-item-link-level-2 menu__list-item"><a class="menu__link menu__link--active" aria-current="page" tabindex="0" href="/docs/4.9.2/development/protocol">BookKeeper protocol</a></li></ul></li><li class="theme-doc-sidebar-item-category theme-doc-sidebar-item-category-level-1 menu__list-item menu__list-item--collapsed"><div class="menu__list-item-collapsible"><a class="menu__link menu__link--sublist menu__link--sublist-caret" aria-expanded="false" href="/docs/4.9.2/reference/config">Reference</a></div></li></ul></nav></div></div></aside><main class="docMainContainer_gTbr"><div class="container padding-top--md padding-bottom--lg"><div class="row"><div class="col docItemCol_VOVn"><div class="docItemContainer_Djhp"><article><span class="theme-doc-version-badge badge badge--secondary">Version: 4.9.2</span><div class="tocCollapsible_ETCw theme-doc-toc-mobile tocMobile_ITEo"><button type="button" class="clean-btn tocCollapsibleButton_TO0P">On this page</button></div><div class="theme-doc-markdown markdown"><header><h1>The BookKeeper protocol</h1></header><p>BookKeeper uses a special replication protocol for guaranteeing persistent storage of entries in an ensemble of bookies.</p><blockquote><p>This document assumes that you have some knowledge of leader election and log replication and how these can be used in a distributed system. 
If not, we recommend reading the <a href="/docs/4.9.2/api/ledger-api#example-application">example application</a> documentation first.</p></blockquote><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ledgers">Ledgers<a href="#ledgers" class="hash-link" aria-label="Direct link to Ledgers" title="Direct link to Ledgers"></a></h2><p>Ledgers are the basic building block of BookKeeper and the level at which BookKeeper makes its persistent storage guarantees. A replicated log consists of an ordered list of ledgers. See <a href="#ledgers-to-logs">Ledgers to logs</a> for info on building a replicated log from ledgers.</p><p>Ledgers are composed of metadata and entries. The metadata is stored in ZooKeeper, which provides a <em>compare-and-swap</em> (CAS) operation. Entries are stored on storage nodes known as bookies.</p><p>A ledger has a single writer and multiple readers (SWMR).</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="ledger-metadata">Ledger metadata<a href="#ledger-metadata" class="hash-link" aria-label="Direct link to Ledger metadata" title="Direct link to Ledger metadata"></a></h3><p>A ledger&#x27;s metadata contains the following:</p><table><thead><tr><th align="left">Parameter</th><th align="left">Name</th><th align="left">Meaning</th></tr></thead><tbody><tr><td align="left">Identifier</td><td align="left"></td><td align="left">A 64-bit integer, unique within the system</td></tr><tr><td align="left">Ensemble size</td><td align="left"><strong>E</strong></td><td align="left">The number of nodes the ledger is stored on</td></tr><tr><td align="left">Write quorum size</td><td align="left"><strong>Q<sub>w</sub></strong></td><td align="left">The number of nodes each entry is written to. In effect, the max replication for the entry.</td></tr><tr><td align="left">Ack quorum size</td><td align="left"><strong>Q<sub>a</sub></strong></td><td align="left">The number of nodes an entry must be acknowledged on. 
In effect, the minimum replication for the entry.</td></tr><tr><td align="left">Current state</td><td align="left"></td><td align="left">The current status of the ledger. One of <code>OPEN</code>, <code>CLOSED</code>, or <code>IN_RECOVERY</code>.</td></tr><tr><td align="left">Last entry</td><td align="left"></td><td align="left">The last entry in the ledger or <code>NULL</code> if the current state is not <code>CLOSED</code>.</td></tr></tbody></table><p>In addition, each ledger&#x27;s metadata consists of one or more <em>fragments</em>. Each fragment consists of</p><ul><li>the first entry of the fragment and</li><li>a list of bookies for the fragment.</li></ul><p>When creating a ledger, the following invariant must hold:</p><p><strong>E &gt;= Q<sub>w</sub> &gt;= Q<sub>a</sub></strong></p><p>Thus, the ensemble size (<strong>E</strong>) must be at least as large as the write quorum size (<strong>Q<sub>w</sub></strong>), which must in turn be at least as large as the ack quorum size (<strong>Q<sub>a</sub></strong>). If that condition does not hold, then the ledger creation operation will fail.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="ensembles">Ensembles<a href="#ensembles" class="hash-link" aria-label="Direct link to Ensembles" title="Direct link to Ensembles"></a></h3><p>When a ledger is created, <strong>E</strong> bookies are chosen for the entries of that ledger. The bookies are the initial ensemble of the ledger. A ledger can have multiple ensembles, but an entry has only one ensemble. Changes in the ensemble involve a new fragment being added to the ledger.</p><p>Take the following example. In this ledger, with ensemble size of 3, there are two fragments and thus two ensembles, one starting at entry 0, the second at entry 12. The second ensemble differs from the first only by its first element. 
This could be because bookie1 has failed and therefore had to be replaced.</p><table><thead><tr><th align="left">First entry</th><th align="left">Bookies</th></tr></thead><tbody><tr><td align="left">0</td><td align="left">B1, B2, B3</td></tr><tr><td align="left">12</td><td align="left">B4, B2, B3</td></tr></tbody></table><h3 class="anchor anchorWithStickyNavbar_LWe7" id="write-quorums">Write quorums<a href="#write-quorums" class="hash-link" aria-label="Direct link to Write quorums" title="Direct link to Write quorums"></a></h3><p>Each entry in the log is written to <strong>Q<sub>w</sub></strong> nodes. This is considered the write quorum for that entry. The write quorum is the subsequence of the ensemble, <strong>Q<sub>w</sub></strong> in length, and starting at the bookie at index (entryid % <strong>E</strong>).</p><p>For example, in a ledger of <strong>E</strong> = 4, <strong>Q<sub>w</sub></strong> = 3, and <strong>Q<sub>a</sub></strong> = 2, with an ensemble consisting of B1, B2, B3, and B4, the write quorums for the first 6 entries will be:</p><table><thead><tr><th align="left">Entry</th><th align="left">Write quorum</th></tr></thead><tbody><tr><td align="left">0</td><td align="left">B1, B2, B3</td></tr><tr><td align="left">1</td><td align="left">B2, B3, B4</td></tr><tr><td align="left">2</td><td align="left">B3, B4, B1</td></tr><tr><td align="left">3</td><td align="left">B4, B1, B2</td></tr><tr><td align="left">4</td><td align="left">B1, B2, B3</td></tr><tr><td align="left">5</td><td align="left">B2, B3, B4</td></tr></tbody></table><p>There are only <strong>E</strong> distinct write quorums in any ensemble. 
If <strong>Q<sub>w</sub></strong> = <strong>E</strong>, then there is only one, as no striping occurs.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="ack-quorums">Ack quorums<a href="#ack-quorums" class="hash-link" aria-label="Direct link to Ack quorums" title="Direct link to Ack quorums"></a></h3><p>The ack quorum for an entry is any subset of the write quorum of size <strong>Q<sub>a</sub></strong>. If <strong>Q<sub>a</sub></strong> bookies acknowledge an entry, it means it has been fully replicated.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="guarantees">Guarantees<a href="#guarantees" class="hash-link" aria-label="Direct link to Guarantees" title="Direct link to Guarantees"></a></h3><p>The system can tolerate <strong>Q<sub>a</sub></strong> – 1 failures without data loss.</p><p>BookKeeper guarantees that:</p><ol><li>All updates to a ledger will be read in the same order as they were written.</li><li>All clients will read the same sequence of updates from the ledger.</li></ol><h2 class="anchor anchorWithStickyNavbar_LWe7" id="writing-to-ledgers">Writing to ledgers<a href="#writing-to-ledgers" class="hash-link" aria-label="Direct link to Writing to ledgers" title="Direct link to Writing to ledgers"></a></h2><p>As there is only a single writer, ensuring that entry ids are sequential is trivial. A bookie acknowledges a write once it has been persisted to disk and is therefore durable. Once <strong>Q<sub>a</sub></strong> bookies from the write quorum acknowledge the write, the write is acknowledged to the client, but only if all entries with lower entry ids in the ledger have already been acknowledged to the client.</p><p>The entry written contains the ledger id, the entry id, the last add confirmed and the payload. The last add confirmed is the last entry which had been acknowledged to the client when this entry was written. 
Sending this with the entry speeds up recovery of the ledger in the case that the writer crashes.</p><p>Another client can also read entries in the ledger up as far as the last add confirmed, as we guarantee that all entries thus far have been replicated on <strong>Q<sub>a</sub></strong> nodes, and therefore all future readers will be able to also read them. However, to read like this, the ledger should be opened with a non-fencing open. Otherwise, it would kill the writer.</p><p>If a node fails to acknowledge a write, the writer will create a new ensemble by replacing the failed node in the current ensemble. It creates a new fragment with this ensemble, starting from the first message that has not been acknowledged to the client. Creating the new fragment involves making a CAS write to the metadata. If the CAS write fails, someone else has modified something in the ledger metadata. This concurrent modification could have been caused by recovery or rereplication. We reread the metadata. If the state of the ledger is no longer <code>OPEN</code>, we send an error to the client for any outstanding writes. Otherwise, we try to replace the failed node again.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="closing-a-ledger-as-a-writer">Closing a ledger as a writer<a href="#closing-a-ledger-as-a-writer" class="hash-link" aria-label="Direct link to Closing a ledger as a writer" title="Direct link to Closing a ledger as a writer"></a></h3><p>Closing a ledger is straightforward for a writer. The writer makes a CAS write to the metadata, changing the state to <code>CLOSED</code> and setting the last entry of the ledger to the last entry which we have acknowledged to the client.</p><p>If the CAS write fails, it means someone else has modified the metadata. We reread the metadata, and retry closing as long as the state of the ledger is still <code>OPEN</code>. If the state is <code>IN_RECOVERY</code> we send an error to the client. 
If the state is <code>CLOSED</code> and the last entry is the same as the last entry we have acknowledged to the client, we complete the close operation successfully. If the last entry is different from what we have acknowledged to the client, we send an error to the client.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="closing-a-ledger-as-a-reader">Closing a ledger as a reader<a href="#closing-a-ledger-as-a-reader" class="hash-link" aria-label="Direct link to Closing a ledger as a reader" title="Direct link to Closing a ledger as a reader"></a></h3><p>A reader can also force a ledger to close. Forcing the ledger to close will prevent any writer from adding new entries to the ledger. This is called fencing. This can occur when a writer has crashed or become unavailable, and a new writer wants to take over writing to the log. The new writer must ensure that it has seen all updates from the previous writer, and prevent the previous writer from making any new updates before making any updates of its own.</p><p>To recover a ledger, we first update the state in the metadata to <code>IN_RECOVERY</code>. We then send a fence message to all the bookies in the last fragment of the ledger. When a bookie receives a fence message for a ledger, the fenced state of the ledger is persisted to disk. Once we receive a response from at least (<strong>Q<sub>w</sub></strong> - <strong>Q<sub>a</sub></strong>)+1 bookies from each write quorum in the ensemble, the ledger is fenced.</p><p>By ensuring we have received a response from at least (<strong>Q<sub>w</sub></strong> - <strong>Q<sub>a</sub></strong>) + 1 bookies in each write quorum, we ensure that, if the old writer is alive and tries to add a new entry, there will be no write quorum in which <strong>Q<sub>a</sub></strong> bookies will accept the write. 
If the old writer tries to update the ensemble, it will fail on the CAS metadata write, and then see that the ledger is in the <code>IN_RECOVERY</code> state, and that it therefore shouldn’t try to write to it.</p><p>The old writer will be able to write entries to individual bookies (we can’t guarantee that the fence message reaches all bookies), but as it will not be able to reach ack quorum, it will not be able to send a success response to its client. The client will get a LedgerFenced error instead.</p><p>It is important to note that when you get a ledger fenced message for an entry, it doesn’t mean that the entry has not been written. It means that the entry may or may not have been written, and this can only be determined after the ledger is recovered. In effect, LedgerFenced should be treated like a timeout.</p><p>Once the ledger is fenced, recovery can begin. Recovery means finding the last entry of the ledger and closing the ledger. To find the last entry of the ledger, the client asks all bookies for the highest last add confirmed value they have seen. It waits until it has received a response from at least (<strong>Q<sub>w</sub></strong> - <strong>Q<sub>a</sub></strong>) + 1 bookies from each write quorum, and takes the highest response as the entry id to start reading forward from. It then starts reading forward in the ledger, one entry at a time, replicating all entries it sees to the entire write quorum for that entry. Once it can no longer read any more entries, it updates the state in the metadata to <code>CLOSED</code>, and sets the last entry of the ledger to the last entry it wrote. 
Multiple readers can try to recover a ledger at the same time, but as the metadata write is CAS they will all converge on the same last entry of the ledger.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="ledgers-to-logs">Ledgers to logs<a href="#ledgers-to-logs" class="hash-link" aria-label="Direct link to Ledgers to logs" title="Direct link to Ledgers to logs"></a></h2><p>In BookKeeper, ledgers can be used to build a replicated log for your system. All guarantees provided by BookKeeper are at the ledger level. Guarantees on the whole log can be built using the ledger guarantees and any consistent datastore with a compare-and-swap (CAS) primitive. BookKeeper uses ZooKeeper as the datastore but others could theoretically be used.</p><p>A log in BookKeeper is built from some number of ledgers, with a fixed order. A ledger represents a single segment of the log. A ledger could be the whole period that one node was the leader, or there could be multiple ledgers for a single period of leadership. However, there can only ever be one leader that adds entries to a single ledger. Ledgers cannot be reopened for writing once they have been closed/recovered.</p><blockquote><p>BookKeeper does <em>not</em> provide leader election. You must use a system like ZooKeeper for this.</p></blockquote><p>In many cases, leader election is really leader suggestion. Multiple nodes could think that they are leader at any one time. It is the job of the log to guarantee that only one can write changes to the system.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="opening-a-log">Opening a log<a href="#opening-a-log" class="hash-link" aria-label="Direct link to Opening a log" title="Direct link to Opening a log"></a></h3><p>Once a node thinks it is leader for a particular log, it must take the following steps:</p><ol><li>Read the list of ledgers for the log</li><li>Fence the last two ledgers in the list. 
Two ledgers are fenced because the writer may be writing to the second-to-last ledger while adding the last ledger to the list.</li><li>Create a new ledger</li><li>Add the new ledger to the ledger list</li><li>Write the new ledger list back to the datastore using a CAS operation</li></ol><p>The fencing in step 2 and the CAS operation in step 5 prevent two nodes from thinking that they have leadership at any one time.</p><p>The CAS operation will fail if the list of ledgers has changed between reading it and writing back the new list. When the CAS operation fails, the leader must start at step 1 again. Even better, they should check that they are in fact still the leader with the system that is providing leader election. The protocol will work correctly without this step, though it will be able to make very little progress if two nodes think they are leader and are duelling for the log.</p><p>The node must not serve any writes until step 5 completes successfully.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="rolling-ledgers">Rolling ledgers<a href="#rolling-ledgers" class="hash-link" aria-label="Direct link to Rolling ledgers" title="Direct link to Rolling ledgers"></a></h3><p>The leader may wish to close the current ledger and open a new one every so often. Ledgers can only be deleted as a whole. If you don&#x27;t roll the log, you won&#x27;t be able to clean up old entries in the log without a leader change. By closing the current ledger and adding a new one, the leader allows the log to be truncated whenever that data is no longer needed. The steps for rolling the log are similar to those for creating a new ledger.</p><ol><li>Create a new ledger</li><li>Add the new ledger to the ledger list</li><li>Write the new ledger list to the datastore using CAS</li><li>Close the previous ledger</li></ol><p>By deferring the closing of the previous ledger until step 4, we can continue writing to the log while we perform metadata update operations to add the new ledger. 
This is safe as long as you fence the last 2 ledgers when acquiring leadership.</p></div></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Docs pages navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/docs/4.9.2/security/zookeeper"><div class="pagination-nav__sublabel">Previous</div><div class="pagination-nav__label">ZooKeeper Authentication</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/docs/4.9.2/reference/config"><div class="pagination-nav__sublabel">Next</div><div class="pagination-nav__label">BookKeeper configuration</div></a></nav></div></div><div class="col col--3"><div class="tableOfContents_bqdL thin-scrollbar theme-doc-toc-desktop"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#ledgers" class="table-of-contents__link toc-highlight">Ledgers</a><ul><li><a href="#ledger-metadata" class="table-of-contents__link toc-highlight">Ledger metadata</a></li><li><a href="#ensembles" class="table-of-contents__link toc-highlight">Ensembles</a></li><li><a href="#write-quorums" class="table-of-contents__link toc-highlight">Write quorums</a></li><li><a href="#ack-quorums" class="table-of-contents__link toc-highlight">Ack quorums</a></li><li><a href="#guarantees" class="table-of-contents__link toc-highlight">Guarantees</a></li></ul></li><li><a href="#writing-to-ledgers" class="table-of-contents__link toc-highlight">Writing to ledgers</a><ul><li><a href="#closing-a-ledger-as-a-writer" class="table-of-contents__link toc-highlight">Closing a ledger as a writer</a></li><li><a href="#closing-a-ledger-as-a-reader" class="table-of-contents__link toc-highlight">Closing a ledger as a reader</a></li></ul></li><li><a href="#ledgers-to-logs" class="table-of-contents__link toc-highlight">Ledgers to logs</a><ul><li><a href="#opening-a-log" class="table-of-contents__link toc-highlight">Opening a log</a></li><li><a href="#rolling-ledgers" class="table-of-contents__link toc-highlight">Rolling 
ledgers</a></li></ul></li></ul></div></div></div></div></main></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Documentation</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/overview">Overview</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/getting-started/installation">Getting started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/deployment/manual">Deployment</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/admin/bookies">Administration</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/overview">API</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/security/overview">Security</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/development/protocol">Development</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/reference/config">Reference</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/community/mailing-lists">Mailing lists</a></li><li class="footer__item"><a class="footer__link-item" href="/community/slack">Slack</a></li><li class="footer__item"><a href="https://github.com/apache/bookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/asfbookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg 
width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Project</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/project/who">Who are we?</a></li><li class="footer__item"><a class="footer__link-item" href="/project/bylaws">Bylaws</a></li><li class="footer__item"><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/project/privacy">Privacy policy</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div 
class="footer__copyright"><footer class="footer">
<div class="container">
<div class="content has-text-centered">
<p>
Copyright &copy; 2016 - 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>,<br> licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, version 2.0</a>.
</p>
<p>
Apache BookKeeper, BookKeeper®, Apache®, the Apache feather logo, and the Apache BookKeeper logo are either registered trademarks or trademarks of The Apache Software Foundation.
</p>
</div>
</div>
</footer>
</div></div></div></footer></div>
<script src="/assets/js/runtime~main.1c369ecb.js"></script>
<script src="/assets/js/main.812b2dbb.js"></script>
</body>
</html>