blob: e0dfaae7f5cbeaaf1b35a5fbd7f1c215a28aef7b [file] [log] [blame]
<!doctype html>
<html lang="en" dir="ltr" class="mdx-wrapper mdx-page plugin-pages plugin-id-default">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v2.4.0">
<title data-rh="true">BP-31: BookKeeper Durability (Anchor) | Apache BookKeeper</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://bookkeeper.apache.org/bps/BP-31-durability"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="BP-31: BookKeeper Durability (Anchor) | Apache BookKeeper"><meta data-rh="true" name="description" content="Motivation"><meta data-rh="true" property="og:description" content="Motivation"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://bookkeeper.apache.org/bps/BP-31-durability"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/bps/BP-31-durability" hreflang="en"><link data-rh="true" rel="alternate" href="https://bookkeeper.apache.org/bps/BP-31-durability" hreflang="x-default"><link rel="stylesheet" href="/assets/css/styles.49914aab.css">
<link rel="preload" href="/assets/js/runtime~main.793d926f.js" as="script">
<link rel="preload" href="/assets/js/main.c5d52852.js" as="script">
</head>
<body class="navigation-with-keyboard">
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}return t}()||function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}()</script><div id="__docusaurus">
<div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--light_HNdA"><img src="/img/bk-logo.svg" alt="Apache Bookkeeper" class="themedImage_ToTc themedImage--dark_i4oU"></div><b class="navbar__title text--truncate">Apache BookKeeper</b></a><a class="navbar__item navbar__link" href="/docs/overview/">Documentation</a><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Community</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/community/mailing-lists">Mailing lists</a></li><li><a class="dropdown__link" href="/community/slack">Slack</a></li><li><a href="https://github.com/apache/bookkeeper/issues" target="_blank" rel="noopener noreferrer" class="dropdown__link">Github issues<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/community/releases">Release management</a></li><li><a class="dropdown__link" href="/community/meeting">Community meetings</a></li><li><a class="dropdown__link" href="/community/contributing">Contribution guide</a></li><li><a 
class="dropdown__link" href="/community/coding-guide">Coding guide</a></li><li><a class="dropdown__link" href="/community/testing">Testing guide</a></li><li><a class="dropdown__link" href="/community/issue-report">Issue report guide</a></li><li><a class="dropdown__link" href="/community/release-guide">Release guide</a></li><li><a class="dropdown__link" href="/community/presentations">Presentations</a></li><li><a class="dropdown__link" href="/community/bookkeeper-proposals">BookKeeper proposals (BP)</a></li></ul></div><div class="navbar__item dropdown dropdown--hoverable"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Project</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/project/who">Who are we?</a></li><li><a class="dropdown__link" href="/project/bylaws">Bylaws</a></li><li><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="dropdown__link">License<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a class="dropdown__link" href="/project/privacy">Privacy policy</a></li><li><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Sponsorship<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="dropdown__link">Thanks<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 
13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a class="navbar__link" aria-haspopup="true" aria-expanded="false" role="button" href="/docs/overview/">4.16.5</a><ul class="dropdown__menu"><li><a class="dropdown__link" href="/docs/next/overview/">Next</a></li><li><a class="dropdown__link" href="/docs/overview/">4.16.5</a></li><li><a class="dropdown__link" href="/docs/4.15.5/overview/">4.15.5</a></li><li><a class="dropdown__link" href="/docs/4.14.8/overview/">4.14.8</a></li><li><a class="dropdown__link" href="/docs/4.13.0/overview/">4.13.0</a></li><li><a class="dropdown__link" href="/docs/4.12.1/overview/">4.12.1</a></li><li><a class="dropdown__link" href="/docs/4.11.1/overview/">4.11.1</a></li><li><a class="dropdown__link" href="/docs/4.10.0/overview/">4.10.0</a></li><li><a class="dropdown__link" href="/docs/4.9.2/overview/">4.9.2</a></li><li><a class="dropdown__link" href="/docs/4.8.2/overview/">4.8.2</a></li><li><a class="dropdown__link" href="/docs/4.7.3/overview/">4.7.3</a></li><li><a class="dropdown__link" href="/docs/4.6.2/overview/">4.6.2</a></li><li><a class="dropdown__link" href="/docs/4.5.1/overview/">4.5.1</a></li></ul></div><a class="navbar__item navbar__link" href="/releases">Download</a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z 
M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="searchBox_ZlJk"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><main class="container container--fluid margin-vert--lg"><div class="row mdxPageWrapper_j9I6"><div class="col col--8"><article><h1>BP-31: BookKeeper Durability (Anchor)</h1><h2 class="anchor anchorWithStickyNavbar_LWe7" id="motivation">Motivation<a href="#motivation" class="hash-link" aria-label="Direct link to Motivation" title="Direct link to Motivation"></a></h2><p>Apache BookKeeper is transitioning into a full fledged 
distributed storage that can keep the data for long term. Durability is paramount to achieve the status of trusted store. This Anchor BP discusses many gaps and areas of improvement. Each issue listed here will have another issue and this BP is expected to be updated when that issue is created.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="durability-contract">Durability Contract<a href="#durability-contract" class="hash-link" aria-label="Direct link to Durability Contract" title="Direct link to Durability Contract"></a></h2><ol><li><strong>Maintain WQ copies all the time</strong>. If any ledger falls into under replicated state, there needs to be an SLA on how quickly the replication levels can be brought back to normal levels.</li><li><strong>Enforce Placement Policy</strong> strictly during write and replication.</li><li><strong>Protect the data</strong> against corruption on the wire or at rest.</li></ol><h2 class="anchor anchorWithStickyNavbar_LWe7" id="work-grouping-in-the-order-of-priority">Work Grouping (In the order of priority)<a href="#work-grouping-in-the-order-of-priority" class="hash-link" aria-label="Direct link to Work Grouping (In the order of priority)" title="Direct link to Work Grouping (In the order of priority)"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="detect-durability-validation">Detect Durability Validation<a href="#detect-durability-validation" class="hash-link" aria-label="Direct link to Detect Durability Validation" title="Direct link to Detect Durability Validation"></a></h3><p>First step is to understand the areas of durability breaches. Design metrics that record durability contract violations. 
</p><ul><li>At the Creation: Validate durability contract when the ledger is being created</li><li>At the Deletion: Validate durability contract when ledger is deleted</li><li>During lifetime: Validate durability contract during the lifetime of the ledger (periodic validator).</li><li>During Read: IO or Checksum errors in the read path</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="delete-discipline">Delete Discipline<a href="#delete-discipline" class="hash-link" aria-label="Direct link to Delete Discipline" title="Direct link to Delete Discipline"></a></h3><ul><li>Build a single delete choke point with stringent validations</li><li>Archival bit in the metadata to assist Two phase Deletes</li><li>Stateful/Explicit Deletes</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="metadata-recovery">Metadata Recovery<a href="#metadata-recovery" class="hash-link" aria-label="Direct link to Metadata Recovery" title="Direct link to Metadata Recovery"></a></h3><ul><li>Metadata recovery tool to reconstruct the metadata if the metadata server gets wiped out. This tool needs to make sure that the data is readable even if we can&#x27;t get all the metadata (ex: ctime) back.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="plug-durability-violations">Plug Durability Violations<a href="#plug-durability-violations" class="hash-link" aria-label="Direct link to Plug Durability Violations" title="Direct link to Plug Durability Violations"></a></h3><p>Our first step is to identify durability violations. That gives us the magnitude of the problem and areas that we need to focus on. 
In this phase, fix high impact areas.</p><ul><li>Identify source of problems detected by the work we did in step-1 above (Detect Durability Validation)</li><li>Rereplicate under replicated ledgers detected during write</li><li>Rereplicate under replicated / corrupted ledgers detected during read</li><li>Replicated under replicated ledgers identified by periodic validator.</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="durability-test">Durability Test<a href="#durability-test" class="hash-link" aria-label="Direct link to Durability Test" title="Direct link to Durability Test"></a></h3><ul><li>Test plan, new tests and integrating it into CI pipeline. </li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="introduce-bookie-incarnation">Introduce bookie incarnation<a href="#introduce-bookie-incarnation" class="hash-link" aria-label="Direct link to Introduce bookie incarnation" title="Direct link to Introduce bookie incarnation"></a></h3><ul><li>Design/Implement bookie incarnation mechanism </li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="end-2-end-checksum">End 2 End Checksum<a href="#end-2-end-checksum" class="hash-link" aria-label="Direct link to End 2 End Checksum" title="Direct link to End 2 End Checksum"></a></h3><ul><li>Efficient checksum implementation (crc32c?)</li><li>Implement checksum validation on bookies in the write path. 
</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="soft-deletes">Soft Deletes<a href="#soft-deletes" class="hash-link" aria-label="Direct link to Soft Deletes" title="Direct link to Soft Deletes"></a></h3><ul><li>Design and implement soft delete feature</li></ul><h3 class="anchor anchorWithStickyNavbar_LWe7" id="bitrot-detection">BitRot detection<a href="#bitrot-detection" class="hash-link" aria-label="Direct link to BitRot detection" title="Direct link to BitRot detection"></a></h3><ul><li>Design and implement bitrot detection/correction.</li></ul><h2 class="anchor anchorWithStickyNavbar_LWe7" id="durability-contract-violations">Durability Contract Violations<a href="#durability-contract-violations" class="hash-link" aria-label="Direct link to Durability Contract Violations" title="Direct link to Durability Contract Violations"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="write-errors-beyond-aq-are-ignored">Write errors beyond AQ are ignored.<a href="#write-errors-beyond-aq-are-ignored" class="hash-link" aria-label="Direct link to Write errors beyond AQ are ignored." title="Direct link to Write errors beyond AQ are ignored."></a></h3><p>BK client library transparently corrects any write errors while writing to bookie by changing the ensemble. Take a case where <code>WQ:3 and AQ:2</code>. This works fine only if the write fails to the bookie before it gets 2 successful responses. But if the 3rd bookie write fails <strong>after</strong> 2 successful responses and the response sent to client, this error is logged and no immediate action is taken to bring up the replication of the entry.
This case <strong>may not be</strong> detected by the auditor’s periodic ledger check. Given that we allow out-of-order writes, in combination with needing only 2 out of 3 responses to satisfy the client, it is possible to have under replication in the middle of the ensemble entry. Hence ledgercheck is not going to find all under replication cases. On top of that, the periodic ledger check is a complete sweep of the store, a very expensive and slow crawl, hence it defaults to running once a week.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="strict-enforcement-of-placement-policy">Strict enforcement of placement policy<a href="#strict-enforcement-of-placement-policy" class="hash-link" aria-label="Direct link to Strict enforcement of placement policy" title="Direct link to Strict enforcement of placement policy"></a></h3><p>The best effort placement policy increases the write availability but at the cost of durability. Due to this non-strict placement, BK can’t guarantee data availability when a fault domain (rack) is lost. This also makes rolling upgrade across fault domains more difficult/impossible. Need to enforce strict ensemble placement and fail the write if all WQ copies are not able to be placed across different fault domains. Minor fix/enhancement if we agree to give placement higher priority than a successful write (availability).</p><p>The auditor re-replication uses client library to find a replacement bookie for each ledger in the lost bookie. But bookies are unaware of the ledger ensemble placement policy as this information is not part of metadata. 
</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="detect-and-act-on-ledger-disk-problems">Detect and act on Ledger disk problems<a href="#detect-and-act-on-ledger-disk-problems" class="hash-link" aria-label="Direct link to Detect and act on Ledger disk problems" title="Direct link to Detect and act on Ledger disk problems"></a></h3><p>While Auditor mechanism detects complete bookie crash, there is no mechanism to detect individual ledger disk errors. So if a ledger disk goes bad, bookie continues to run, and auditor can’t recognize under replication condition, until it runs the complete sweep, periodic ledger check. On the other hand bookie refuses to come up if it finds a bad disk, which is right thing to do. This is easy to fix, in the interleaved ledger manger bad disk handle.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="checksum-at-bookies-in-the-write-path">Checksum at bookies in the write path<a href="#checksum-at-bookies-in-the-write-path" class="hash-link" aria-label="Direct link to Checksum at bookies in the write path" title="Direct link to Checksum at bookies in the write path"></a></h3><p>Lack of checksum calculations on the write path makes the store not to detect any corruption at the source issues. Imagine NIC issues on the client. If data gets corrupted at the client NIC’s level it safely gets stored on bookies (for the lack of crc calculations in the write path). This is a silent corruption of all 3 copies. For additional durability guarantees we can add checksum verification on bookies in the write path. Checksum calculations are cpu intensive and will add to the latency. But Java9 is adding native support for CRC32-C - A hardware assisted CRC calculation. We can consider adding this additional during JAVA-9 port after evaluating performance tradeoffs. 
</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="no-repair-in-the-read-path">No repair in the read path<a href="#no-repair-in-the-read-path" class="hash-link" aria-label="Direct link to No repair in the read path" title="Direct link to No repair in the read path"></a></h3><p>When a checksum error is detected, in addition to finding a good replica, the store needs to repair (replace with a good one) the bad replica too.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="operations">Operations<a href="#operations" class="hash-link" aria-label="Direct link to Operations" title="Direct link to Operations"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="no-bookie-incarnation-mechanism">No bookie incarnation mechanism<a href="#no-bookie-incarnation-mechanism" class="hash-link" aria-label="Direct link to No bookie incarnation mechanism" title="Direct link to No bookie incarnation mechanism"></a></h3><p>A bookie <code>B1 at time t1</code> and the same bookie <code>B1 at time t2</code> after bookie format are treated in the same way.
For this to cause any durability issues:</p><ul><li>Replication/Auditor mechanism is stopped or not running for some reason. (A stuck auditor will start a new one due to ZK)</li><li>One of bookies(B1) went down (crash or something)</li><li>B1’s Journal dir and all ledger dir got wiped.</li><li>B1 came back to life as a fresh bookie</li><li>Auditor is enabled monitoring again</li></ul><p>At this point auditor doesn’t have capability to know that the B1 in the cluster is not the same B1 that it used to be. Hence doesn’t consider it for under replication. This is a pathological scenario but we at least need to have a mechanism to identify and alert this scenario if not taking care of bookie incarnation issue.</p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="enhancements">Enhancements<a href="#enhancements" class="hash-link" aria-label="Direct link to Enhancements" title="Direct link to Enhancements"></a></h2><h3 class="anchor anchorWithStickyNavbar_LWe7" id="delete-choke-points">Delete Choke Points<a href="#delete-choke-points" class="hash-link" aria-label="Direct link to Delete Choke Points" title="Direct link to Delete Choke Points"></a></h3><p>Every delete must go through single routine/path in the code and that needs to implement additional checks to perform physical delete.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="archival-bit-in-the-metadata-to-assist-two-phase-deletes">Archival bit in the metadata to assist Two phase Deletes<a href="#archival-bit-in-the-metadata-to-assist-two-phase-deletes" class="hash-link" aria-label="Direct link to Archival bit in the metadata to assist Two phase Deletes" title="Direct link to Archival bit in the metadata to assist Two phase Deletes"></a></h3><p>Main aim of this feature is to be as conservative as possible on the delete path. As explained in the stateful explicit deletes section, lack of ledgerId in the metadata means that ledger is deleted. A bug in the client code may erroneously delete the ledger. 
To protect from that, we want to introduce an archive/backed-up bit. A separate backup/archival application can mark the bit after successfully backing up the ledger, and later on main client application will send the delete. If this feature is enabled, BK client will reject and throw an exception if it receives a delete request while the archival/backed-up bit is not set. This protects the data from bugs and erroneous deletes.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="stateful-explicit-deltes">Stateful explicit deletes<a href="#stateful-explicit-deltes" class="hash-link" aria-label="Direct link to Stateful explicit deletes" title="Direct link to Stateful explicit deletes"></a></h3><p>Currently, a BookKeeper delete synchronously deletes the metadata in ZooKeeper. Bookies implicitly assume that a particular ledger is deleted if it is not present in the metadata. This process has no crosscheck if the ledger is actually deleted. Any ZK corruption or loss of the ledger path znodes will make bookies delete data on the disk. No cross check. Even bugs in bookie code which ‘determines’ if a ledger is present on the zk or not, may lead to data deletion. </p><p>Right way to deal with this is to asynchronously delete metadata after each bookie explicitly checks that a particular ledger is deleted. This way each bookie explicitly checks the ‘delete state’ of the ledger before deleting on the disk data. One of the proposals is to move the deleted ledgers under /deleted/<!-- -->&lt;<!-- -->ledgerId<!-- -->&gt;<!-- -->; another idea is to add a delete state, Open-&gt;Closed-&gt;Deleted.</p><p>As soon as we make the metadata deletions asynchronous, the immediate question is who will delete it?
Option-1: A centralized process like auditor will be responsible for deleting metadata after each bookie deletes on disk data.
Option-2: A decentralized, more complicated approach: Last bookie that deletes its on disk data, deletes the metadata too.
I am sure there can be more ideas. Any path will need a detailed design and need to consider many corner cases.</p><h4 class="anchor anchorWithStickyNavbar_LWe7" id="obvious-points-to-consider">Obvious points to consider:<a href="#obvious-points-to-consider" class="hash-link" aria-label="Direct link to Obvious points to consider:" title="Direct link to Obvious points to consider:"></a></h4><p>ZK as-is is heavily loaded with BK metadata. Keeping these znodes around for more time than needed puts more pressure on ZK.
If a bookie is down for long time, what would be the delete policy for the metadata?
There will be lots of corner case scenarios we need to deal with. For example:
A bookie-1 hosting data for ledger-1 is down for long time
Ledger-1 data has been replicated to other bookies
Ledger-1 is deleted, and its data and metadata are cleared.
Now bookie-1 came back to life. Since our policy is ‘explicit state check delete’ bookie-1 can’t delete ledger-1 data as it can’t explicitly validate that the ledger-1 has been deleted.
One possible solution: keep tombstones of deleted ledgers around for some duration. If a bookie is down for more than that duration, it needs to be decommissioned and added as a new bookie.
Enhance: Archival bit in the metadata to assist Two phase Deletes
Main aim of this feature is to be as conservative as possible on the delete path. As explained in the stateful explicit deletes section, lack of ledgerId in the metadata means that ledger is deleted. A bug in the client code may erroneously delete the ledger. To protect from that, we want to introduce an archive/backed-up bit. A separate backup/archival application can mark the bit after successfully backing up the ledger, and later on main client application will send the delete. If this feature is enabled, BK client will reject and throw an exception if it receives a delete request while the archival/backed-up bit is not set. This protects the data from bugs and erroneous deletes.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="metadata-recovery-tool">Metadata recovery tool<a href="#metadata-recovery-tool" class="hash-link" aria-label="Direct link to Metadata recovery tool" title="Direct link to Metadata recovery tool"></a></h3><p>In case ZooKeeper is completely wiped we need a way to reconstruct enough metadata to read ledgers back. Currently metadata contains ensemble information which is critical for reading ledgers back, and also it has additional metadata like ctime and custom metadata. Every bookie has one index file per ledger and that has enough information to reconstruct the ensemble information so that the ledgers can be made readable. This tool can be built in two ways.
If ZK is completely wiped, reconstruct entire data from bookie index files.
If ZK is completely wiped, but snapshots are available, restore ZK from snapshots and build the delta from bookie index files.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="bit-rot-detection-bp-24">Bit Rot Detection (BP-24)<a href="#bit-rot-detection-bp-24" class="hash-link" aria-label="Direct link to Bit Rot Detection (BP-24)" title="Direct link to Bit Rot Detection (BP-24)"></a></h3><p>If the data stays on the disk for a long time (years), it is possible to experience silent data degradation on the disk. In the current scenario we will not identify this until the data is read by the application.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="end-to-end-checksum">End to end checksum<a href="#end-to-end-checksum" class="hash-link" aria-label="Direct link to End to end checksum" title="Direct link to End to end checksum"></a></h3><p>Bookies never validate the payload checksum. If the client’s socket has issues, it might corrupt the data (at the source) and it won’t be detected until client reads it back. That will be too late as the original write was successful for the application. Use efficient checksum mechanisms and enforce checksum validations on the bookie’s write path. If checksum validation fails, write itself will fail and application will be notified. </p><h2 class="anchor anchorWithStickyNavbar_LWe7" id="test-strategy-to-validate-durability">Test strategy to validate durability<a href="#test-strategy-to-validate-durability" class="hash-link" aria-label="Direct link to Test strategy to validate durability" title="Direct link to Test strategy to validate durability"></a></h2><p>BK needs to develop a comprehensive testing strategy to test and validate the store’s durability. Various methods and levels of tests are needed to gain confidence for deploying the store in production. 
Specific points are mentioned here and these are in addition to regular functional testing/validation.</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="white-box-error-injection">White box error injection<a href="#white-box-error-injection" class="hash-link" aria-label="Direct link to White box error injection" title="Direct link to White box error injection"></a></h3><p>Introduce all possible errors in the write path, kick replication mechanism and make sure cluster reached desired replica levels.
Corrupt first readable copy and make sure that the corruption is detected on the read path, and ultimately read must succeed after trying second replica.
Corrupt packet after checksum calculation on the client and make sure that it is detected in the read path, and ultimately read fails as this is corruption at the source.
After a write make sure that the replica is distributed across fault zones.
Kill a bookie, make sure that the auditor detected and replicated all ledgers in that bookie according to allocation policy (across fault zones)</p><h3 class="anchor anchorWithStickyNavbar_LWe7" id="black-box-error-injection-chaos-monkey">Black box error injection (Chaos Monkey)<a href="#black-box-error-injection-chaos-monkey" class="hash-link" aria-label="Direct link to Black box error injection (Chaos Monkey)" title="Direct link to Black box error injection (Chaos Monkey)"></a></h3><p>While running longevity testing that does continuous IO to the store, introduce the following errors.
Kill random bookie and reads should continue.
Kill random bookies keeping minimum fault zones to satisfy AQ Quorum during write workload.
Simulate disk errors in random bookies and allow the bookie to go down and replication gets started.
Make sure that the cluster is running in full durable state through the tools and monitoring built.</p></article></div><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#motivation" class="table-of-contents__link toc-highlight">Motivation</a></li><li><a href="#durability-contract" class="table-of-contents__link toc-highlight">Durability Contract</a></li><li><a href="#work-grouping-in-the-order-of-priority" class="table-of-contents__link toc-highlight">Work Grouping (In the order of priority)</a><ul><li><a href="#detect-durability-validation" class="table-of-contents__link toc-highlight">Detect Durability Validation</a></li><li><a href="#delete-discipline" class="table-of-contents__link toc-highlight">Delete Discipline</a></li><li><a href="#metadata-recovery" class="table-of-contents__link toc-highlight">Metadata Recovery</a></li><li><a href="#plug-durability-violations" class="table-of-contents__link toc-highlight">Plug Durability Violations</a></li><li><a href="#durability-test" class="table-of-contents__link toc-highlight">Durability Test</a></li><li><a href="#introduce-bookie-incarnation" class="table-of-contents__link toc-highlight">Introduce bookie incarnation</a></li><li><a href="#end-2-end-checksum" class="table-of-contents__link toc-highlight">End 2 End Checksum</a></li><li><a href="#soft-deletes" class="table-of-contents__link toc-highlight">Soft Deletes</a></li><li><a href="#bitrot-detection" class="table-of-contents__link toc-highlight">BitRot detection</a></li></ul></li><li><a href="#durability-contract-violations" class="table-of-contents__link toc-highlight">Durability Contract Violations</a><ul><li><a href="#write-errors-beyond-aq-are-ignored" class="table-of-contents__link toc-highlight">Write errors beyond AQ are ignored.</a></li><li><a href="#strict-enforcement-of-placement-policy" class="table-of-contents__link toc-highlight">Strict enforcement of 
placement policy</a></li><li><a href="#detect-and-act-on-ledger-disk-problems" class="table-of-contents__link toc-highlight">Detect and act on Ledger disk problems</a></li><li><a href="#checksum-at-bookies-in-the-write-path" class="table-of-contents__link toc-highlight">Checksum at bookies in the write path</a></li><li><a href="#no-repair-in-the-read-path" class="table-of-contents__link toc-highlight">No repair in the read path</a></li></ul></li><li><a href="#operations" class="table-of-contents__link toc-highlight">Operations</a><ul><li><a href="#no-bookie-incarnation-mechanism" class="table-of-contents__link toc-highlight">No bookie incarnation mechanism</a></li></ul></li><li><a href="#enhancements" class="table-of-contents__link toc-highlight">Enhancements</a><ul><li><a href="#delete-choke-points" class="table-of-contents__link toc-highlight">Delete Choke Points</a></li><li><a href="#archival-bit-in-the-metadata-to-assist-two-phase-deletes" class="table-of-contents__link toc-highlight">Archival bit in the metadata to assist Two phase Deletes</a></li><li><a href="#stateful-explicit-deltes" class="table-of-contents__link toc-highlight">Stateful explicit deltes</a></li><li><a href="#metadata-recovery-tool" class="table-of-contents__link toc-highlight">Metadata recovery tool</a></li><li><a href="#bit-rot-detection-bp-24" class="table-of-contents__link toc-highlight">Bit Rot Detection (BP-24)</a></li><li><a href="#end-to-end-checksum" class="table-of-contents__link toc-highlight">End to end checksum</a></li></ul></li><li><a href="#test-strategy-to-validate-durability" class="table-of-contents__link toc-highlight">Test strategy to validate durability</a><ul><li><a href="#white-box-error-injection" class="table-of-contents__link toc-highlight">White box error injection</a></li><li><a href="#black-box-error-injection-chaos-monkey" class="table-of-contents__link toc-highlight">Black box error injection (Chaos 
Monkey)</a></li></ul></li></ul></div></div></div></main></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Documentation</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/docs/overview">Overview</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/getting-started/installation">Getting started</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/deployment/manual">Deployment</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/admin/bookies">Administration</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/api/overview">API</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/security/overview">Security</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/development/protocol">Development</a></li><li class="footer__item"><a class="footer__link-item" href="/docs/reference/config">Reference</a></li></ul></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/community/mailing-lists">Mailing lists</a></li><li class="footer__item"><a class="footer__link-item" href="/community/slack">Slack</a></li><li class="footer__item"><a href="https://github.com/apache/bookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">Github<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/asfbookkeeper" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg width="13.5" 
height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Project</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/project/who">Who are we?</a></li><li class="footer__item"><a class="footer__link-item" href="/project/bylaws">Bylaws</a></li><li class="footer__item"><a href="https://apache.org/licenses" target="_blank" rel="noopener noreferrer" class="footer__link-item">License<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a class="footer__link-item" href="/project/privacy">Privacy policy</a></li><li class="footer__item"><a href="https://www.apache.org/foundation/sponsorship.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Sponsorship<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.apache.org/foundation/thanks.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Thanks<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright"><footer 
class="footer">
<div class="container">
<div class="content has-text-centered">
<p>
Copyright &copy; 2016 - 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>,<br> licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, version 2.0</a>.
</p>
<p>
Apache BookKeeper, BookKeeper®, Apache®, the Apache feather logo, and the Apache BookKeeper logo are either registered trademarks or trademarks of The Apache Software Foundation.
</p>
</div>
</div>
</footer>
</div></div></div></footer></div>
<script src="/assets/js/runtime~main.793d926f.js"></script>
<script src="/assets/js/main.c5d52852.js"></script>
</body>
</html>