<!-- blob: 4d4cb58eb8288e1a527bd96521001fabd9fff2a0 [file] [log] [blame] -->
<!doctype html>
<html lang="en" dir="ltr" class="mdx-wrapper mdx-page plugin-pages plugin-id-default">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.0">
<title data-rh="true">BP-46: Running without the journal | Apache BookKeeper</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://bookkeeper.apache.org/bps/BP-46-run-without-journal"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="BP-46: Running without the journal | Apache BookKeeper"><meta data-rh="true" name="description" content="Motivation"><meta data-rh="true" property="og:description" content="Motivation"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://bookkeeper.apache.org/bps/BP-46-run-without-journal"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/bps/BP-46-run-without-journal" hreflang="en"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/bps/BP-46-run-without-journal" hreflang="x-default"><link rel="stylesheet" href="/assets/css/styles.49914aab.css">
<link rel="preload" href="/assets/js/runtime~main.793d926f.js" as="script">
<link rel="preload" href="/assets/js/main.c5d52852.js" as="script">
</head>
<body class="navigation-with-keyboard">
<script>
// Inline bootstrap that sets the "data-theme" attribute on the root <html> element.
// Precedence: "docusaurus-theme" query parameter, then the "theme" key in
// localStorage, then the literal "light".
(function () {
  // Value of the "docusaurus-theme" query parameter, or null when it is absent
  // or reading it throws.
  function themeFromQuery() {
    try {
      return new URLSearchParams(window.location.search).get("docusaurus-theme");
    } catch (ignored) {
      return null;
    }
  }

  // Value stored under "theme" in localStorage, or null when it is absent
  // or accessing storage throws.
  function themeFromStorage() {
    try {
      return localStorage.getItem("theme");
    } catch (ignored) {
      return null;
    }
  }

  // `||` (not a null check) is deliberate: an empty query value also falls
  // through to the stored value.
  var chosenTheme = themeFromQuery() || themeFromStorage();
  if (chosenTheme === null) {
    chosenTheme = "light";
  }
  document.documentElement.setAttribute("data-theme", chosenTheme);
})();
</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--dark_i4oU"></div><b class="navbar__title text--truncate">Apache BookKeeper</b></a><a class="navbar__item navbar__link" href="/docs/overview/">Documentation</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/mailing-lists">Mailing lists</a></li><li><a class="dropdown__link" href="/community/slack">Slack</a></li><li><a href="https://github.com/apache/bookkeeper/issues" target="_blank" rel="noopener noreferrer" class="dropdown__link">Github issues<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/community/releases">Release management</a></li><li><a class="dropdown__link" href="/community/meeting">Community meetings</a></li><li><a class="dropdown__link" href="/community/contributing">Contribution guide</a></li><li><a 
class="dropdown__link" href="/community/coding-guide">Coding guide</a></li><li><a class="dropdown__link" href="/community/testing">Testing guide</a></li><li><a class="dropdown__link" href="/community/issue-report">Issue report guide</a></li><li><a class="dropdown__link" href="/community/release-guide">Release guide</a></li><li><a class="dropdown__link" href="/community/presentations">Presentations</a></li><li><a class="dropdown__link" href="/community/bookkeeper-proposals">BookKeeper proposals (BP)</a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Project</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/project/who">Who are we?</a></li><li><a class="dropdown__link" href="/project/bylaws">Bylaws</a></li><li><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="dropdown__link">License<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/project/privacy">Privacy policy</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 
13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link" aria-haspopup="true" aria-expanded="false" role="button" href="/docs/overview/">4.16.5</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/overview/">Next</a></li><li><a class="dropdown__link" href="/docs/overview/">4.16.5</a></li><li><a class="dropdown__link" href="/docs/4.15.5/overview/">4.15.5</a></li><li><a class="dropdown__link" href="/docs/4.14.8/overview/">4.14.8</a></li><li><a class="dropdown__link" href="/docs/4.13.0/overview/">4.13.0</a></li><li><a class="dropdown__link" href="/docs/4.12.1/overview/">4.12.1</a></li><li><a class="dropdown__link" href="/docs/4.11.1/overview/">4.11.1</a></li><li><a class="dropdown__link" href="/docs/4.10.0/overview/">4.10.0</a></li><li><a class="dropdown__link" href="/docs/4.9.2/overview/">4.9.2</a></li><li><a class="dropdown__link" href="/docs/4.8.2/overview/">4.8.2</a></li><li><a class="dropdown__link" href="/docs/4.7.3/overview/">4.7.3</a></li><li><a class="dropdown__link" href="/docs/4.6.2/overview/">4.6.2</a></li><li><a class="dropdown__link" href="/docs/4.5.1/overview/">4.5.1</a></li></ul></div><a class="navbar__item navbar__link" href="/releases">Download</a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z 
M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="searchBox_ZlJk"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><main class="container container--fluid margin-vert--lg"><div class="row mdxPageWrapper_j9I6"><div class="col col--8"><article><h1>BP-46: Running without the journal</h1><h3 class="anchor anchorWithStickyNavbar_LWe7" id="motivation">Motivation<a href="#motivation" class="hash-link" aria-label="Direct link to Motivation" title="Direct link to Motivation"></a></h3><p>The journal allows for fast add operations that provide strong data 
safety guarantees. An add operation is only acked to a client once written to the journal and an fsync performed. This however means that every entry must be written twice: once to the journal and once to an entry log file.</p><p>This double write increases the cost of ownership as more disks must be provisioned to service requests and makes disk provisioning more complex (separating journal from entry log writes onto separate disks). Running without the journal would halve the disk IO required (ignoring indexes) thereby reducing costs and simplifying provisioning.</p><p>However, running without the journal would introduce data consistency problems as the BookKeeper Replication Protocol requires that all writes are persistent for correctness. Running without the journal introduces the possibility of lost writes. In order to continue to offer strong data safety and support running without the journal, changes to the protocol are required.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="a-note-on-response-codes">A note on Response Codes<a href="#a-note-on-response-codes" class="hash-link" aria-label="Direct link to A note on Response Codes" title="Direct link to A note on Response Codes"></a></h3><p>The following categories are relevant:</p><ul><li>Positive: OK</li><li>Explicit Negative: NoSuchEntry/NoSuchLedger</li><li>Unknown: Any other non-success response that is not an explicit negative.</li></ul><p>For correctness explicit negatives must be treated differently than other errors.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="a-note-on-quorums">A note on Quorums<a href="#a-note-on-quorums" class="hash-link" aria-label="Direct link to A note on Quorums" title="Direct link to A note on Quorums"></a></h3><p>In order to explain the protocol changes, it is useful to first consider how quorums are used for safety. 
We have the following relevant quorums:</p><ul><li>Single bookie (S)</li><li>Ack quorum (AQ)</li><li>Write quorum (WQ)</li><li>Quorum Coverage (QC) where QC = (WQ - AQ) + 1</li><li>Ensemble Coverage (EC) where EC = (E - AQ) + 1</li><li>Whole Ensemble</li></ul><p>Quorum Coverage (QC) and Ensemble Coverage (EC) are both defined by the following, only the cohorts differ: </p><ul><li>A given property is satisfied by at least one bookie from every possible ack quorum within the cohort.</li><li>There exists no ack quorum of bookies that do not satisfy the property within the cohort.</li></ul><p>For QC, the cohort is the writeset of a given entry, and therefore QC is only used when we need guarantees regarding a single entry. For EC, the cohort is the ensemble of bookies of the current fragment. EC is required when we need a guarantee across an entire fragment.</p><p>For example:</p><ul><li>For fencing, we need to ensure that no AQ of bookies is unfenced before starting the read/write phase of recovery. This is true once EC successful fencing responses have been received.</li><li>For a recovery read, a read is only negative once we know that no AQ of bookies could exist that might have the entry. Doing otherwise could truncate committed entries from a ledger. 
A read is negative once NoSuchEntry responses reach QC.</li></ul><p>Different protocol actions require different quorums:</p><ul><li>Add entry: AQ success responses</li><li>Read entry:<ul><li>Positive when positive response from a single bookie</li><li>Negative when explicit negative from all bookies</li><li>Unknown: when at least one unknown and no positive from all bookies</li></ul></li><li>Fencing phase, LAC read (sent to ensemble of current fragment):<ul><li>Complete when EC positive responses</li><li>Unknown (cannot make progress) when AQ unknown responses (fencing LAC reads cannot cause an explicit negative as fencing creates the ledger on the bookie if it doesn’t exist)</li></ul></li><li>Recovery read (sent to writeset of entry):<ul><li>Entry recoverable: AQ positive read responses</li><li>Entry Unrecoverable: QC negative read responses</li><li>Unknown (cannot make progress):<ul><li>QC unknown responses or</li><li>All responses received, but not enough for either a positive or negative</li></ul></li></ul></li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="impact-of-undetected-data-loss-on-consistency">Impact of Undetected Data Loss on Consistency<a href="#impact-of-undetected-data-loss-on-consistency" class="hash-link" aria-label="Direct link to Impact of Undetected Data Loss on Consistency" title="Direct link to Impact of Undetected Data Loss on Consistency"></a></h3><p>The ledger recovery process assumes that ledger entries are never arbitrarily lost. 
In the event of the loss of an entry, the recovery process can:</p><ul><li>allow the original client to keep writing entries to a ledger that has just been fenced and closed, thus losing those entries </li><li>allow the recovery client to truncate the ledger too soon, closing it with a last entry id lower than that of previously acknowledged entries - thus losing data.</li></ul><p>The following scenarios assume the existing behaviour, except that the writing of entries and fencing ops to the journal is skipped.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="scenario-1---lost-fenced-status-allows-writes-after-ledger-close">Scenario 1 - Lost Fenced Status Allows Writes After Ledger Close<a href="#scenario-1---lost-fenced-status-allows-writes-after-ledger-close" class="hash-link" aria-label="Direct link to Scenario 1 - Lost Fenced Status Allows Writes After Ledger Close" title="Direct link to Scenario 1 - Lost Fenced Status Allows Writes After Ledger Close"></a></h3><ol><li>3 bookies, B1, B2 &amp; B3</li><li>2 clients, C1 &amp; C2</li><li>1 ledger, L1, with e3:w3:a2 configuration.</li><li>C1 writes entry E1 to L1. The write hits all three bookies.</li><li>C1 hangs for an indeterminate length of time. </li><li>C2 sees that C1 is unresponsive, and assumes it has failed. C2 tries to recover the ledger L1.</li><li>C2 sends a fencing message for L1 to all bookies in the ensemble.</li><li>The fencing message succeeds in arriving at B1 &amp; B2 and is acknowledged by both. The message to B3 is lost. </li><li>C2 sees that at least one bookie in each possible ack quorum has acknowledged the fencing message (EC threshold reached), so continues with the read/write phase of recovery, finding that E1 is the last entry of the ledger, and committing the endpoint of the ledger in ZK.</li><li>B2 crashes and boots again with all unflushed operations lost. </li><li>C1 wakes up and writes entry E2 to all bookies. B2 &amp; B3 acknowledge positively, so C1 considers E2 as persisted. 
B1 rejects the message as the ledger is fenced, but since ack quorum is 2, B2 &amp; B3 are enough to consider the entry written.</li></ol><h3 class="anchor anchorWithStickyNavbar_LWe7" id="scenario-2---recovery-truncates-previously-acknowledged-entries">Scenario 2 - Recovery Truncates Previously Acknowledged Entries<a href="#scenario-2---recovery-truncates-previously-acknowledged-entries" class="hash-link" aria-label="Direct link to Scenario 2 - Recovery Truncates Previously Acknowledged Entries" title="Direct link to Scenario 2 - Recovery Truncates Previously Acknowledged Entries"></a></h3><ol><li>C1 adds E0 to B1, B2, B3</li><li>B1 and B3 confirm. C1 confirms the write to its client.</li><li>C2 starts recovery</li><li>B2 fails to respond. C1 tries to change ensemble but gets a metadata version conflict.</li><li>B1 crashes and restarts, has lost E0 (undetected)</li><li>C2 fences the ledger on B1, B2, B3</li><li>C2 sends Read E0 to B1, B2, B3</li><li>B1 responds with NoSuchEntry</li><li>B2 responds with NoSuchEntry</li><li>QC negative response threshold reached. C2 closes the ledger as empty, losing E0.</li></ol><p>The problem is that without the journal (and without syncing to entry log files before acknowledgement) a bookie can:</p><ul><li>lose the fenced status of a previously existing ledger</li><li>respond with an explicit negative even though it had previously seen an entry. </li></ul><p>Undetected data loss could occur when running without the journal. A bookie that crashes loses its most recent entries and fence statuses that had not yet been written and synced to disk.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="a-note-on-cookies">A note on cookies<a href="#a-note-on-cookies" class="hash-link" aria-label="Direct link to A note on cookies" title="Direct link to A note on cookies"></a></h3><p>Cookies play an essential part in the BookKeeper replication protocol, but their purpose is often unclear. 
</p><p>When a bookie boots for the first time, it generates a cookie. The cookie encapsulates the identity of the bookie and should be considered immutable. This identity contains the advertised address of the bookie, the disks used for the journal, index, and ledger storage, and a unique ID. The bookie writes the cookie to ZK and each of the disks in use. On all subsequent boots, if the cookie is missing from any of these places, the bookie fails to boot.</p><p>The absence of a disk&#x27;s cookie implies that the rest of the disk&#x27;s data is also missing. Cookie validation is performed on boot-up and prevents the boot from succeeding if the validation fails, thus preventing the bookie starting with undetected data loss. </p><p>This proposal improves the cookie mechanism by automating the resolution of a cookie validation error which currently requires human intervention to resolve. This automated feature will be configurable (enabled or disabled) and additionally a CLI command will be made available so an admin can manually run the operation (for when this feature is disabled - likely to be the default). 
</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="proposed-changes">Proposed Changes<a href="#proposed-changes" class="hash-link" aria-label="Direct link to Proposed Changes" title="Direct link to Proposed Changes"></a></h3><p>The proposed changes involve:</p><ul><li>A new config that controls whether add operations go into the journal</li><li>Detecting possible data loss on boot</li><li>Prevent explicit negative responses when data loss may have occurred, instead reply with unknown code, until data is repaired.</li><li>Repair data loss</li><li>Auto fix cookies (with new config to enable or disable the feature)</li><li>CLI command for admin to run fix cookie logic in the case that auto fix is disabled</li></ul><p>In these proposed changes, when running &quot;without&quot; the journal, the journal still exists, but add entry operations skip the addition to the journal. The boot-up sequence still replays the journal.</p><p>Add operations can be configured to be written to the journal or not based on the config <code>journalWriteData</code>. When set to <code>false</code>, add operations are not added to the journal.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="detecting-data-loss-on-boot">Detecting Data Loss On Boot<a href="#detecting-data-loss-on-boot" class="hash-link" aria-label="Direct link to Detecting Data Loss On Boot" title="Direct link to Detecting Data Loss On Boot"></a></h3><p>The new mechanism for data loss detection is checking for an unclean shutdown (aka a crash or abrupt termination of the bookie). When an unclean shutdown is detected further measures are taken to prevent data inconsistency.</p><p>The unclean shutdown detection will consist of setting a bit in the index on start-up and clearing it on shutdown. 
On subsequent start-up, the value will be checked and if it remains set, the bookie knows that the prior shutdown was not clean.</p><p>Cookie validation will continue to be used to detect booting with one or more missing or empty disks (that once existed and contained a cookie).</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="protection-mechanism">Protection Mechanism<a href="#protection-mechanism" class="hash-link" aria-label="Direct link to Protection Mechanism" title="Direct link to Protection Mechanism"></a></h3><p>Once possible data loss has been detected the following protection mechanism is carried out during the boot:</p><ul><li>Fencing: Ledger metadata for all ledgers of the cluster is obtained and all those ledgers are fenced on this bookie. This prevents data loss scenario 1.</li><li>Limbo: All open ledgers are placed in the limbo status. Limbo ledgers can serve read requests, but never respond with an explicit negative; all explicit negatives are converted to unknowns (with the use of a new code EUNKNOWN).</li><li>Recovery: All open ledgers are opened and recovered.</li><li>Repair: Each ledger is scanned and any missing entries are sourced from peers.</li><li>Limbo ledgers that have been repaired have their limbo status cleared.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="the-full-boot-up-sequence">The Full Boot-Up Sequence<a href="#the-full-boot-up-sequence" class="hash-link" aria-label="Direct link to The Full Boot-Up Sequence" title="Direct link to The Full Boot-Up Sequence"></a></h3><p>This mechanism of limbo ledgers and self-repair needs to work hand in hand with the cookie validation check. 
Combining everything together:</p><p>On boot:</p><ol><li>Check for unclean shutdown and validate cookies</li><li>Fetch the metadata for all ledgers in the cluster from ZK where the bookie is a member of its ensemble.</li><li>Phase one:<ul><li>If the cookie check fails or unclean shutdown is detected:<ul><li>For each non-closed ledger, mark the ledger as fenced and in-limbo in the index.</li><li>Update the cookie if it was a cookie failure</li></ul></li></ul></li><li>Phase two<ul><li>For each ledger<ol><li>If the ledger is in-limbo, open and recover the ledger.</li><li>Check that all entries assigned to this bookie exist in the index.</li><li>For any entries that are missing, copy from another bookie.</li><li>Clear limbo status if set</li></ol></li></ul></li></ol><p>When booting a bookie with empty disks, only phase one needs to be complete before the bookie makes itself available for client requests. </p><p>In phase one, if the cookie check fails, we mark all non-closed ledgers as “fenced”. This prevents any future writes to these ledgers on this bookie. This solves the problem of an empty bookie disk allowing writes to closed ledgers (Scenario 1).</p><p>Given that the algorithm solves both the issues that cookies are designed to solve, we can now allow the bookie to update its cookie without operator intervention. </p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="formal-verification-of-proposed-changes">Formal Verification of Proposed Changes<a href="#formal-verification-of-proposed-changes" class="hash-link" aria-label="Direct link to Formal Verification of Proposed Changes" title="Direct link to Formal Verification of Proposed Changes"></a></h3><p>The use of the limbo status and fencing of all ledgers on boot-up when detecting an unclean shutdown has been modelled in TLA+. It does not model the whole boot-up sequence but a simplified version with only fencing and limbo status. 
</p><p>The specification models the lifetime of a single ledger and includes a single bookie crashing, losing all data. The specification allows the testing of:</p><ul><li>enabling/disabling the fencing</li><li>enabling/disabling the limbo status.</li></ul><p>When running without limbo status, the model checker finds the counterexample of scenario 2. When running without fencing of all ledgers, the model checker finds the counterexample of scenario 1. When running with both enabled, the model checker finds no invariant violation.</p><p>The specification can be found here: <a href="https://github.com/Vanlightly/bookkeeper-tlaplus" target="_blank" rel="noopener noreferrer">https://github.com/Vanlightly/bookkeeper-tlaplus</a></p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="public-interfaces">Public Interfaces<a href="#public-interfaces" class="hash-link" aria-label="Direct link to Public Interfaces" title="Direct link to Public Interfaces"></a></h3><ul><li>Return codes. Addition of a new return code: <code>EUNKNOWN</code> which is returned when a read hits an in-limbo ledger and that ledger does not contain the requested entry id.</li><li>Bookie ledger metadata format (LedgerData). Addition of the limbo status.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="compatibility-deprecation-and-migration-plan">Compatibility, Deprecation, and Migration Plan<a href="#compatibility-deprecation-and-migration-plan" class="hash-link" aria-label="Direct link to Compatibility, Deprecation, and Migration Plan" title="Direct link to Compatibility, Deprecation, and Migration Plan"></a></h3><ul><li>Because we only skip the journal for add operations, there is no impact on existing deployments. 
When a bookie is booted with the new version, and <code>journalWriteData</code> is set to false, the journal is still replayed on boot-up causing no risk of data loss in the transition.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="test-plan">Test Plan<a href="#test-plan" class="hash-link" aria-label="Direct link to Test Plan" title="Direct link to Test Plan"></a></h3><ul><li>There is confidence in the design due to the modelling in TLA+ but this model does not include the full boot sequence.</li><li>The implementation will require aggressive chaos testing to ensure correctness.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="rejected-alternatives">Rejected Alternatives<a href="#rejected-alternatives" class="hash-link" aria-label="Direct link to Rejected Alternatives" title="Direct link to Rejected Alternatives"></a></h3><p>Entry Log Per Ledger (ELPL) without the journal. From our performance testing of ELPL, performance degrades significantly with a large number of active ledgers and syncing to disk multiple times a second (which is required to offer low latency writes).</p><p>In the future this design could be extended to offer ledger level configuration of journal use. 
The scope of this BP is limited to cluster level.</p></article></div><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#motivation" class="table-of-contents__link toc-highlight">Motivation</a></li><li><a href="#a-note-on-response-codes" class="table-of-contents__link toc-highlight">A note on Response Codes</a></li><li><a href="#a-note-on-quorums" class="table-of-contents__link toc-highlight">A note on Quorums</a></li><li><a href="#impact-of-undetected-data-loss-on-consistency" class="table-of-contents__link toc-highlight">Impact of Undetected Data Loss on Consistency</a></li><li><a href="#scenario-1---lost-fenced-status-allows-writes-after-ledger-close" class="table-of-contents__link toc-highlight">Scenario 1 - Lost Fenced Status Allows Writes After Ledger Close</a></li><li><a href="#scenario-2---recovery-truncates-previously-acknowledged-entries" class="table-of-contents__link toc-highlight">Scenario 2 - Recovery Truncates Previously Acknowledged Entries</a></li><li><a href="#a-note-on-cookies" class="table-of-contents__link toc-highlight">A note on cookies</a></li><li><a href="#proposed-changes" class="table-of-contents__link toc-highlight">Proposed Changes</a></li><li><a href="#detecting-data-loss-on-boot" class="table-of-contents__link toc-highlight">Detecting Data Loss On Boot</a></li><li><a href="#protection-mechanism" class="table-of-contents__link toc-highlight">Protection Mechanism</a></li><li><a href="#the-full-boot-up-sequence" class="table-of-contents__link toc-highlight">The Full Boot-Up Sequence</a></li><li><a href="#formal-verification-of-proposed-changes" class="table-of-contents__link toc-highlight">Formal Verification of Proposed Changes</a></li><li><a href="#public-interfaces" class="table-of-contents__link toc-highlight">Public Interfaces</a></li><li><a href="#compatibility-deprecation-and-migration-plan" class="table-of-contents__link 
toc-highlight">Compatibility, Deprecation, and Migration Plan</a></li><li><a href="#test-plan" class="table-of-contents__link toc-highlight">Test Plan</a></li><li><a href="#rejected-alternatives" class="table-of-contents__link toc-highlight">Rejected Alternatives</a></li></ul></div></div></div></main></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Documentation</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/overview">Overview</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/getting-started/installation">Getting started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/deployment/manual">Deployment</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/admin/bookies">Administration</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/overview">API</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/security/overview">Security</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/development/protocol">Development</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/reference/config">Reference</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/community/mailing-lists">Mailing lists</a></li><li class="footer__item"><a class="footer__link-item" href="/community/slack">Slack</a></li><li class="footer__item"><a href="https://github.com/apache/bookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">Github<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 
13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/asfbookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Project</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/project/who">Who are we?</a></li><li class="footer__item"><a class="footer__link-item" href="/project/bylaws">Bylaws</a></li><li class="footer__item"><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/project/privacy">Privacy policy</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" 
class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright"><footer class="footer">
<div class="container">
<div class="content has-text-centered">
<p>
Copyright &copy; 2016 - 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>,<br> licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, version 2.0</a>.
</p>
<p>
Apache BookKeeper, BookKeeper®, Apache®, the Apache feature logo, and the Apache BookKeeper logo are either registered trademarks or trademarks of The Apache Software Foundation.
</p>
</div>
</div>
</footer>
</div></div></div></footer></div>
<script src="/assets/js/runtime~main.793d926f.js"></script>
<script src="/assets/js/main.c5d52852.js"></script>
</body>
</html>