blob: 5176bbadeb1325d9d822d6d7eef43e3b6fb88bdf [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<title>Apache BookKeeper&trade; - BP-28: use etcd as metadata store</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="/css/normalize.css">
<link rel="stylesheet" href="/css/tippy.css">
<link rel="stylesheet" href="/css/style.css">
<link rel="shortcut icon" href="/img/favicon.ico">
<script src="/js/tippy.min.js"></script>
<script type="text/javascript">
// Scrolls the viewport up by 25px.
// NOTE(review): presumably compensates for the top navbar covering anchor targets; confirm against the CSS.
var shiftWindow = function() {
  scrollBy(0, -25);
};

// Re-apply the offset whenever the URL fragment changes or the page is shown.
["hashchange", "pageshow"].forEach(function(eventName) {
  window.addEventListener(eventName, shiftWindow);
});

// Applies the offset once when the URL already carries a fragment.
// NOTE(review): nothing visible in this page invokes load(); verify whether a caller exists elsewhere.
function load() {
  if (!window.location.hash) {
    return;
  }
  shiftWindow();
}
</script>
</head>
<body class="body">
<main class="main">
<nav class="navbar bk-topnav">
<div class="navbar-brand">
<a class="navbar-item bk-brand" href="/">
Apache BookKeeper&trade;
</a>
<div class="navbar-burger burger" data-target="bkNav">
<span></span>
<span></span>
<span></span>
</div>
</div>
<div id="bkNav" class="navbar-menu">
<div class="navbar-start">
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">Documentation</a>
<div class="navbar-dropdown is-boxed">
<a class="navbar-item" href="/docs/latest/overview/overview">
Version 4.15.0-SNAPSHOT
<span class="tag is-warning">Development</span>
</a>
<a class="navbar-item" href="/docs/latest/api/javadoc">
<span class="icon bk-javadoc-icon">
<img src="/img/java-icon.svg" alt="">
</span>
Javadoc
</a>
<hr class="dropdown-divider">
<a class="navbar-item" href="/docs/4.14.0/overview/overview">
Release 4.14.0
</a>
<a class="navbar-item" href="/docs/4.13.0/overview/overview">
Release 4.13.0
</a>
<a class="navbar-item" href="/docs/4.12.1/overview/overview">
Release 4.12.1
</a>
<a class="navbar-item" href="/docs/4.12.0/overview/overview">
Release 4.12.0
</a>
<a class="navbar-item" href="/docs/4.11.1/overview/overview">
Release 4.11.1
<span class="tag is-success">Stable</span>
</a>
<a class="navbar-item" href="/docs/4.11.0/overview/overview">
Release 4.11.0
</a>
<a class="navbar-item" href="/docs/4.10.0/overview/overview">
Release 4.10.0
</a>
<a class="navbar-item" href="/archives/docs/r4.9.2">
Release 4.9.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.9.1">
Release 4.9.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.9.0">
Release 4.9.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.8.2">
Release 4.8.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.8.1">
Release 4.8.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.8.0">
Release 4.8.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.7.3">
Release 4.7.3
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.7.2">
Release 4.7.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.7.1">
Release 4.7.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.7.0">
Release 4.7.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.6.2">
Release 4.6.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.6.1">
Release 4.6.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.6.0">
Release 4.6.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.5.1">
Release 4.5.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.5.0">
Release 4.5.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.4.0">
Release 4.4.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.3.2">
Release 4.3.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.3.1">
Release 4.3.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.3.0">
Release 4.3.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.2.4">
Release 4.2.4
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.2.3">
Release 4.2.3
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.2.2">
Release 4.2.2
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.2.1">
Release 4.2.1
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.2.0">
Release 4.2.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.1.0">
Release 4.1.0
<span class="tag is-warning">EOL</span>
</a>
<a class="navbar-item" href="/archives/docs/r4.0.0">
Release 4.0.0
<span class="tag is-warning">EOL</span>
</a>
</div>
</div>
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">Community</a>
<div class="navbar-dropdown is-boxed">
<a class="navbar-item" href="/community/mailing-lists">Mailing lists</a>
<a class="navbar-item" href="/community/slack">Slack</a>
<a class="navbar-item" href="https://github.com/apache/bookkeeper/issues">Github Issues</a>
<a class="navbar-item" href="/community/releases">Release Management</a>
<a class="navbar-item" href="/community/meeting">Community Meetings</a>
<hr class="dropdown-divider">
<a class="navbar-item" href="/community/contributing">Contribution Guide</a>
<a class="navbar-item" href="/community/coding_guide">Coding Guide</a>
<a class="navbar-item" href="/community/testing">Testing Guide</a>
<a class="navbar-item" href="/community/issue-report">Issue Report Guide</a>
<a class="navbar-item" href="/community/release_guide">Release Guide</a>
<hr class="dropdown-divider">
<a class="navbar-item" href="/community/presentations">Presentations</a>
<a class="navbar-item" href="/community/bookkeeper_proposals">BookKeeper Proposals</a>
</div>
</div>
<div class="navbar-item has-dropdown is-hoverable">
<a class="navbar-link">Project</a>
<div class="navbar-dropdown is-boxed">
<a class="navbar-item" href="/project/who">Who are we?</a>
<a class="navbar-item" href="/project/bylaws">Bylaws</a>
<a class="navbar-item" href="http://www.apache.org/licenses/">License</a>
<hr class="dropdown-divider">
<a class="navbar-item" href="/project/privacy">Privacy policy</a>
<a class="navbar-item" href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a>
<a class="navbar-item" href="http://www.apache.org/foundation/thanks.html">Thanks</a>
</div>
</div>
</div>
<div class="navbar-end">
<div class="navbar-item">
<div class="field is-grouped">
<p class="control">
<a class="button bk-twitter" href="https://twitter.com/asfbookkeeper">
<span class="icon">
<i class="fa fa-twitter"></i>
</span>
<span>Twitter</span>
</a>
</p>
<p class="control">
<a class="button" href="https://github.com/apache/bookkeeper">
<span class="icon">
<i class="fa fa-github"></i>
</span>
<span>GitHub</span>
</a>
</p>
<p class="control">
<a class="button is-primary" href="/releases">
<span class="icon">
<i class="fa fa-download"></i>
</span>
<span>Download</span>
</a>
</p>
</div>
</div>
</div>
</div>
</nav>
<div class="bk-community-container">
<div class="columns">
<div class="column is-12">
<header class="docs-title">
<nav class="level">
<div class="level-left">
<div class="level-item">
<h1 class="title">BP-28: use etcd as metadata store</h1>
</div>
</div>
</nav>
</header>
<hr />
<div class="content is-medium">
<section class="bk-community-content">
<h3 id="motivation">Motivation</h3>
<p>Currently bookkeeper uses zookeeper as the metadata store. However, there are a couple of issues with the current approach, especially with using zookeeper.</p>
<p>These issues include:</p>
<ol>
<li>You need to allocate special nodes for zookeeper. These nodes need to be treated specially, and have their own monitoring.
Ops need to understand both bookies and zookeeper.</li>
<li>ZooKeeper is the scalability bottleneck. ZooKeeper doesn’t scale writes as you add nodes. This means that if your bookkeeper
cluster reaches the maximum write throughput that ZK can sustain, you’ve reached the maximum capacity of your cluster, and there’s nothing you
can do (except buy bigger hardware for your special nodes).</li>
<li>ZooKeeper forces you into its programming model. In general, its programming model is not too bad. However it becomes problematic when
the scale goes up (e.g. the number of clients and watchers increases). The issues usually come from <em>session expires</em> and <em>watchers</em>.
<ul>
<li><em>Session Expires</em>: For simplicity, ZooKeeper ties session state directly with connection state. So when a connection is broken, a session is usually expired (unless it reconnects before session expires), and when a session is expired, the underlying connection can not be used anymore, the application has to close the connection and re-establish a new client (a new connection). It is understandable that it makes zookeeper development easy. However in reality, it means if you can not establish a session, you can’t use this connection and you have to create new connections. Once your zookeeper cluster is in a bad state (e.g. network issue or jvm gc), the whole cluster is usually unable to recover because of the connection storm introduced by session expires.</li>
<li><em>Watchers</em>: The zookeeper watcher is a one-time watcher, so applications can’t reliably use it to get updates. In order to set a watcher, you have to read a znode or get children. Imagine such a use case: clients are watching a list of znodes (e.g. list of bookies); when those clients expire, they have to get the list of znodes in order to rewatch the list, even if the list has never changed.</li>
<li>The combination of session expires and watchers is often the root cause of critical zookeeper outages.</li>
</ul>
</li>
</ol>
<p>This proposal is to explore other existing systems such as etcd as the metadata store. Using Etcd doesn’t address concern #1, however it might potentially
address concerns #2 and #3 to some extent. And if you are running bookkeeper in k8s, there is already an Etcd instance available. It can become easier to run
bookkeeper on k8s if we can use Etcd as the metadata store.</p>
<p>NOTE: this proposal has some overlaps on goals/vision with the distributed k/v store work (a separate BP will be sent out soon). However they are not conflicting with each other.
Both proposals are exploring a better metadata storage solution for bookkeeper.</p>
<h3 id="public-interfaces">Public Interfaces</h3>
<p>A new metadata store module <code class="highlighter-rouge">metadata-store-etcd</code> will be added to bookkeeper. This module will implement all the required metadata interfaces.</p>
<p>These interfaces include:</p>
<ul>
<li>RegistrationClient</li>
<li>RegistrationManager</li>
<li>LayoutManager</li>
<li>LedgerIdGenerator</li>
<li>LedgerManager</li>
<li>LedgerUnderreplicatedManager</li>
<li>LedgerManagerFactory</li>
</ul>
<h3 id="proposed-changes">Proposed Changes</h3>
<p>Since Etcd provides a key/value model rather than a tree-like structure, the metadata will be organized in a key/value way as below:</p>
<ul>
<li><code class="highlighter-rouge">scope</code>: the prefix used for prefixing all the keys used for storing the metadata for a given cluster. The <code class="highlighter-rouge">scope</code> is effectively same as <code class="highlighter-rouge">zkLedgerRootPath</code>.</li>
<li><code class="highlighter-rouge">scope</code>/LAYOUT: key for storing layout data</li>
<li><code class="highlighter-rouge">scope</code>/INSTANCEID: key for storing instance id</li>
<li><code class="highlighter-rouge">scope</code>/IDGEN/<code class="highlighter-rouge">&lt;bucket&gt;</code>: key for id generation. <code class="highlighter-rouge">&lt;bucket&gt;</code> is to allow concurrent id generation.</li>
<li><code class="highlighter-rouge">scope</code>/cookies/<code class="highlighter-rouge">&lt;bookieid&gt;</code>: key for storing a bookie’s cookie</li>
<li><code class="highlighter-rouge">scope</code>/available/readwrite/<code class="highlighter-rouge">&lt;bookieid&gt;</code>: key for registering readwrite bookie. (lease will be applied to this key)</li>
<li><code class="highlighter-rouge">scope</code>/available/readonly/<code class="highlighter-rouge">&lt;bookieid&gt;</code>: key for registering readonly bookie. (lease will be applied to this key)</li>
<li><code class="highlighter-rouge">scope</code>/ledgers/<code class="highlighter-rouge">&lt;ledgerid&gt;</code>: key for storing a ledger metadata. <code class="highlighter-rouge">&lt;ledgerid&gt;</code> is <code class="highlighter-rouge">String.format("%019d", ledgerId)</code>, pre-padding with 0s to make sure ledgers are stored in order.</li>
</ul>
<h4 id="registration-manager">Registration Manager</h4>
<ul>
<li>Bookie Register: write to key “<code class="highlighter-rouge">scope</code>/available/readwrite/<code class="highlighter-rouge">&lt;bookieid&gt;</code>” with a keepalive lease.</li>
<li>Bookie Unregister: delete key “<code class="highlighter-rouge">scope</code>/available/readwrite/<code class="highlighter-rouge">&lt;bookieid&gt;</code>”.</li>
</ul>
<h4 id="registration-client">Registration Client</h4>
<ul>
<li>Get readwrite bookies: range operation to fetch keys between “<code class="highlighter-rouge">scope</code>/available/readwrite/” and “<code class="highlighter-rouge">scope</code>/available/readwrite_end/”.</li>
<li>Get readonly bookies: range operation to fetch keys between “<code class="highlighter-rouge">scope</code>/available/readonly/” and “<code class="highlighter-rouge">scope</code>/available/readonly_end/”.</li>
<li>Watch readwrite bookies: watch operation to watch keys between “<code class="highlighter-rouge">scope</code>/available/readwrite/” and “<code class="highlighter-rouge">scope</code>/available/readwrite_end/”.</li>
<li>Watch readonly bookies: watch operation to watch keys between “<code class="highlighter-rouge">scope</code>/available/readonly/” and “<code class="highlighter-rouge">scope</code>/available/readonly_end/”.</li>
</ul>
<h4 id="layout-manager">Layout Manager</h4>
<ul>
<li>Create layout: a txn operation: put layout data to key “<code class="highlighter-rouge">scope</code>/LAYOUT” when this key doesn’t exist.</li>
<li>Delete layout: a delete operation on “<code class="highlighter-rouge">scope</code>/LAYOUT”</li>
<li>Read layout: a get operation on “<code class="highlighter-rouge">scope</code>/LAYOUT”</li>
</ul>
<h4 id="ledger-id-generator">Ledger Id Generator</h4>
<ul>
<li>Id generation: get key, increment by 1 and then update with a txn operation.</li>
</ul>
<h4 id="ledger-manager">Ledger Manager</h4>
<ul>
<li>Create Ledger: a txn operation to put ledger metadata to key “<code class="highlighter-rouge">scope</code>/ledgers/<code class="highlighter-rouge">&lt;ledgerid&gt;</code>” when this key doesn’t exist</li>
<li>Write Ledger: a txn operation to put ledger metadata to key “<code class="highlighter-rouge">scope</code>/ledgers/<code class="highlighter-rouge">&lt;ledgerid&gt;</code>” when key version matches</li>
<li>Delete Ledger: a delete operation on key “<code class="highlighter-rouge">scope</code>/ledgers/<code class="highlighter-rouge">&lt;ledgerid&gt;</code>”</li>
<li>Read Ledger: a get operation on key “<code class="highlighter-rouge">scope</code>/ledgers/<code class="highlighter-rouge">&lt;ledgerid&gt;</code>”</li>
<li>Iteration: a range operation to fetch keys between “<code class="highlighter-rouge">scope</code>/ledgers/” and “<code class="highlighter-rouge">scope</code>/ledgers_end/”</li>
</ul>
<h3 id="compatibility-deprecation-and-migration-plan">Compatibility, Deprecation, and Migration Plan</h3>
<ul>
<li>This is a new metadata store. No impacts to existing users.</li>
<li>We are not planning to implement any migration tools any time soon. This is only for new users or for setting up new clusters.</li>
</ul>
<h3 id="test-plan">Test Plan</h3>
<ul>
<li>Unit tests for all the metadata manager implementations.</li>
<li>End-to-end integration tests</li>
<li>Jepsen tests</li>
</ul>
<h3 id="rejected-alternatives">Rejected Alternatives</h3>
<p>N/A</p>
</section>
</div>
</div>
</div>
</div>
</main>
<footer class="footer">
<div class="container">
<div class="content has-text-centered">
<p>
Copyright &copy; 2016 - 2021 <a href="https://www.apache.org/">The Apache Software Foundation</a>,<br /> licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, version 2.0</a>.
</p>
<p>
Apache BookKeeper, BookKeeper®, Apache®, the Apache feature logo, and the Apache BookKeeper logo are either registered trademarks or trademarks of The Apache Software Foundation.
</p>
</div>
</div>
</footer>
</body>
<script src="/js/app.js"></script>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<script>
// Google Analytics (analytics.js) loader. The IIFE below:
//   - stores the global name 'ga' in window.GoogleAnalyticsObject,
//   - installs window.ga as a stub that pushes its arguments onto ga.q unless ga already exists,
//   - records the current time (as a number) in ga.l,
//   - creates a script element with async set and src pointing at analytics.js, and inserts it
//     before the first script element found in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Issue the 'create' command for property UA-104419626-1, then a 'pageview' send.
// Until analytics.js defines the real ga, these calls are only queued by the stub above.
ga('create', 'UA-104419626-1', 'auto');
ga('send', 'pageview');
</script>
</html>