| <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> |
| <html> |
| <head> |
| <META http-equiv="Content-Type" content="text/html; charset=UTF-8"> |
| <meta content="Apache Forrest" name="Generator"> |
| <meta name="Forrest-version" content="0.9"> |
| <meta name="Forrest-skin-name" content="pelt"> |
| <title>ZooKeeper Internals</title> |
| <link type="text/css" href="skin/basic.css" rel="stylesheet"> |
| <link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> |
| <link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> |
| <link type="text/css" href="skin/profile.css" rel="stylesheet"> |
| <script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> |
| <link rel="shortcut icon" href="images/favicon.ico"> |
| </head> |
| <body onload="init()"> |
| <script type="text/javascript">ndeSetTextSize();</script> |
| <div id="top"> |
| <!--+ |
| |breadtrail |
| +--> |
| <div class="breadtrail"> |
| <a href="http://www.apache.org/">Apache</a> > <a href="http://hadoop.apache.org/">Hadoop</a> > <a href="http://hadoop.apache.org/zookeeper/">ZooKeeper</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> |
| </div> |
| <!--+ |
| |header |
| +--> |
| <div class="header"> |
| <!--+ |
| |start group logo |
| +--> |
| <div class="grouplogo"> |
| <a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a> |
| </div> |
| <!--+ |
| |end group logo |
| +--> |
| <!--+ |
| |start Project Logo |
| +--> |
| <div class="projectlogo"> |
| <a href="http://hadoop.apache.org/zookeeper/"><img class="logoImage" alt="ZooKeeper" src="images/zookeeper_small.gif" title="ZooKeeper: distributed coordination"></a> |
| </div> |
| <!--+ |
| |end Project Logo |
| +--> |
| <!--+ |
| |start Search |
| +--> |
| <div class="searchbox"> |
| <form action="http://www.google.com/search" method="get" class="roundtopsmall"> |
| <input value="hadoop.apache.org" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google"> |
| <input name="Search" value="Search" type="submit"> |
| </form> |
| </div> |
| <!--+ |
| |end search |
| +--> |
| <!--+ |
| |start Tabs |
| +--> |
| <ul id="tabs"> |
| <li> |
| <a class="unselected" href="http://hadoop.apache.org/zookeeper/">Project</a> |
| </li> |
| <li> |
| <a class="unselected" href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a> |
| </li> |
| <li class="current"> |
| <a class="selected" href="index.html">ZooKeeper 3.3 Documentation</a> |
| </li> |
| </ul> |
| <!--+ |
| |end Tabs |
| +--> |
| </div> |
| </div> |
| <div id="main"> |
| <div id="publishedStrip"> |
| <!--+ |
| |start Subtabs |
| +--> |
| <div id="level2tabs"></div> |
| <!--+ |
| |end Endtabs |
| +--> |
| <script type="text/javascript"><!-- |
| document.write("Last Published: " + document.lastModified); |
| // --></script> |
| </div> |
| <!--+ |
| |breadtrail |
| +--> |
| <div class="breadtrail"> |
| |
| |
| </div> |
| <!--+ |
| |start Menu, mainarea |
| +--> |
| <!--+ |
| |start Menu |
| +--> |
| <div id="menu"> |
| <div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">Overview</div> |
| <div id="menu_1.1" class="menuitemgroup"> |
| <div class="menuitem"> |
| <a href="index.html">Welcome</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperOver.html">Overview</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperStarted.html">Getting Started</a> |
| </div> |
| <div class="menuitem"> |
| <a href="releasenotes.html">Release Notes</a> |
| </div> |
| </div> |
| <div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Developer</div> |
| <div id="menu_1.2" class="menuitemgroup"> |
| <div class="menuitem"> |
| <a href="api/index.html">API Docs</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperProgrammers.html">Programmer's Guide</a> |
| </div> |
| <div class="menuitem"> |
| <a href="javaExample.html">Java Example</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperTutorial.html">Barrier and Queue Tutorial</a> |
| </div> |
| <div class="menuitem"> |
| <a href="recipes.html">Recipes</a> |
| </div> |
| </div> |
| <div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">BookKeeper</div> |
| <div id="menu_1.3" class="menuitemgroup"> |
| <div class="menuitem"> |
| <a href="bookkeeperStarted.html">Getting started</a> |
| </div> |
| <div class="menuitem"> |
| <a href="bookkeeperOverview.html">Overview</a> |
| </div> |
| <div class="menuitem"> |
| <a href="bookkeeperConfig.html">Setup guide</a> |
| </div> |
| <div class="menuitem"> |
| <a href="bookkeeperProgrammer.html">Programmer's guide</a> |
| </div> |
| </div> |
| <div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Admin &amp; Ops</div> |
| <div id="menu_1.4" class="menuitemgroup"> |
| <div class="menuitem"> |
| <a href="zookeeperAdmin.html">Administrator's Guide</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperQuotas.html">Quota Guide</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperJMX.html">JMX</a> |
| </div> |
| <div class="menuitem"> |
| <a href="zookeeperObservers.html">Observers Guide</a> |
| </div> |
| </div> |
| <div onclick="SwitchMenu('menu_selected_1.5', 'skin/')" id="menu_selected_1.5Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Contributor</div> |
| <div id="menu_selected_1.5" class="selectedmenuitemgroup" style="display: block;"> |
| <div class="menupage"> |
| <div class="menupagetitle">ZooKeeper Internals</div> |
| </div> |
| </div> |
| <div onclick="SwitchMenu('menu_1.6', 'skin/')" id="menu_1.6Title" class="menutitle">Miscellaneous</div> |
| <div id="menu_1.6" class="menuitemgroup"> |
| <div class="menuitem"> |
| <a href="http://wiki.apache.org/hadoop/ZooKeeper">Wiki</a> |
| </div> |
| <div class="menuitem"> |
| <a href="http://wiki.apache.org/hadoop/ZooKeeper/FAQ">FAQ</a> |
| </div> |
| <div class="menuitem"> |
| <a href="http://hadoop.apache.org/zookeeper/mailing_lists.html">Mailing Lists</a> |
| </div> |
| </div> |
| <div id="credit"></div> |
| <div id="roundbottom"> |
| <img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> |
| <!--+ |
| |alternative credits |
| +--> |
| <div id="credit2"></div> |
| </div> |
| <!--+ |
| |end Menu |
| +--> |
| <!--+ |
| |start content |
| +--> |
| <div id="content"> |
| <div title="Portable Document Format" class="pdflink"> |
| <a class="dida" href="zookeeperInternals.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> |
| PDF</a> |
| </div> |
| <h1>ZooKeeper Internals</h1> |
| <div id="front-matter"> |
| <div id="minitoc-area"> |
| <ul class="minitoc"> |
| <li> |
| <a href="#ch_Introduction">Introduction</a> |
| </li> |
| <li> |
| <a href="#sc_atomicBroadcast">Atomic Broadcast</a> |
| <ul class="minitoc"> |
| <li> |
| <a href="#sc_guaranteesPropertiesDefinitions">Guarantees, Properties, and Definitions</a> |
| </li> |
| <li> |
| <a href="#sc_leaderElection">Leader Activation</a> |
| </li> |
| <li> |
| <a href="#sc_activeMessaging">Active Messaging</a> |
| </li> |
| <li> |
| <a href="#sc_summary">Summary</a> |
| </li> |
| <li> |
| <a href="#sc_comparisons">Comparisons</a> |
| </li> |
| </ul> |
| </li> |
| <li> |
| <a href="#sc_quorum">Quorums</a> |
| </li> |
| <li> |
| <a href="#sc_logging">Logging</a> |
| <ul class="minitoc"> |
| <li> |
| <a href="#sc_developerGuidelines">Developer Guidelines</a> |
| <ul class="minitoc"> |
| <li> |
| <a href="#sc_rightLevel">Logging at the Right Level</a> |
| </li> |
| <li> |
| <a href="#sc_log4jIdioms">Use of Standard log4j Idioms</a> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </div> |
| |
| |
| |
| |
| |
| <a name="ch_Introduction"></a> |
| <h2 class="h3">Introduction</h2> |
| <div class="section"> |
| <p>This document contains information on the inner workings of ZooKeeper. |
| So far, it discusses these topics: |
| </p> |
| <ul> |
| |
| <li> |
| <p> |
| <a href="#sc_atomicBroadcast">Atomic Broadcast</a> |
| </p> |
| </li> |
| |
| <li> |
| <p> |
| <a href="#sc_logging">Logging</a> |
| </p> |
| </li> |
| |
| </ul> |
| </div> |
| |
| |
| <a name="sc_atomicBroadcast"></a> |
| <h2 class="h3">Atomic Broadcast</h2> |
| <div class="section"> |
| <p> |
| At the heart of ZooKeeper is an atomic messaging system that keeps all of the servers in sync.</p> |
| <a name="sc_guaranteesPropertiesDefinitions"></a> |
| <h3 class="h4">Guarantees, Properties, and Definitions</h3> |
| <p> |
| The specific guarantees provided by the messaging system used by ZooKeeper are the following:</p> |
| <dl> |
| |
| |
| <dt> |
| <term> |
| <em>Reliable delivery</em> |
| </term> |
| </dt> |
| <dd> |
| <p>If a message, m, is delivered |
| by one server, it will be eventually delivered by all servers.</p> |
| </dd> |
| |
| |
| <dt> |
| <term> |
| <em>Total order</em> |
| </term> |
| </dt> |
| <dd> |
| <p> If message a is |
| delivered before message b by one server, a will be delivered before b by all |
| servers. If a and b are delivered messages, either a will be delivered before b |
| or b will be delivered before a.</p> |
| </dd> |
| |
| |
| <dt> |
| <term> |
| <em>Causal order</em> |
| </term> |
| </dt> |
| <dd> |
| <p> |
| If a message b is sent after a message a has been delivered by the sender of b, |
| a must be ordered before b. If a sender sends c after sending b, c must be ordered after b. |
| </p> |
| </dd> |
| |
| |
| </dl> |
| <p> |
| The ZooKeeper messaging system also needs to be efficient, reliable, and easy to |
| implement and maintain. We make heavy use of messaging, so we need the system to |
| be able to handle thousands of requests per second. Although we can require at |
| least k+1 correct servers to send new messages, we must be able to recover from |
| correlated failures such as power outages. When we implemented the system we had |
| little time and few engineering resources, so we needed a protocol that is |
| accessible to engineers and is easy to implement. We found that our protocol |
| satisfied all of these goals. |
| |
| </p> |
| <p> |
| Our protocol assumes that we can construct point-to-point FIFO channels between |
| the servers. While similar services usually assume message delivery that can |
| lose or reorder messages, our assumption of FIFO channels is very practical |
| given that we use TCP for communication. Specifically we rely on the following property of TCP:</p> |
| <dl> |
| |
| |
| <dt> |
| <term> |
| <em>Ordered delivery</em> |
| </term> |
| </dt> |
| <dd> |
| <p>Data is delivered in the same order it is sent and a message m is |
| delivered only after all messages sent before m have been delivered. |
| (The corollary to this is that if message m is lost all messages after m will be lost.)</p> |
| </dd> |
| |
| |
| <dt> |
| <term> |
| <em>No message after close</em> |
| </term> |
| </dt> |
| <dd> |
| <p>Once a FIFO channel is closed, no messages will be received from it.</p> |
| </dd> |
| |
| |
| </dl> |
| <p> |
| FLP proved that consensus cannot be achieved in asynchronous distributed systems |
| if failures are possible. To ensure we achieve consensus in the presence of failures |
| we use timeouts. However, we rely on timeouts for liveness, not for correctness. So, |
| if timeouts stop working (clocks malfunction for example) the messaging system may |
| hang, but it will not violate its guarantees.</p> |
| <p>When describing the ZooKeeper messaging protocol we will talk of packets, |
| proposals, and messages:</p> |
| <dl> |
| |
| <dt> |
| <term> |
| <em>Packet</em> |
| </term> |
| </dt> |
| <dd> |
| <p>a sequence of bytes sent through a FIFO channel</p> |
| </dd> |
| <dt> |
| <term> |
| <em>Proposal</em> |
| </term> |
| </dt> |
| <dd> |
| <p>a unit of agreement. Proposals are agreed upon by exchanging packets |
| with a quorum of ZooKeeper servers. Most proposals contain messages, however the |
| NEW_LEADER proposal is an example of a proposal that does not correspond to a message.</p> |
| </dd> |
| <dt> |
| <term> |
| <em>Message</em> |
| </term> |
| </dt> |
| <dd> |
| <p>a sequence of bytes to be atomically broadcast to all ZooKeeper |
| servers. A message is put into a proposal and agreed upon before it is delivered.</p> |
| </dd> |
| |
| |
| </dl> |
| <p> |
| As stated above, ZooKeeper guarantees a total order of messages, and it also |
| guarantees a total order of proposals. ZooKeeper exposes the total ordering using |
| a ZooKeeper transaction id (<em>zxid</em>). Each proposal is stamped with a zxid when |
| it is proposed, and the zxid exactly reflects the total ordering. Proposals are sent to all |
| ZooKeeper servers and committed when a quorum of them acknowledge the proposal. |
| If a proposal contains a message, the message will be delivered when the proposal |
| is committed. Acknowledgement means the server has recorded the proposal to persistent storage. |
| Our quorums have the requirement that any pair of quorums must have at least one server |
| in common. We ensure this by requiring that all quorums have size (<em>n/2+1</em>) where |
| n is the number of servers that make up a ZooKeeper service. |
| </p> |
| <p> |
| The zxid has two parts: the epoch and a counter. In our implementation the zxid |
| is a 64-bit number. We use the high-order 32 bits for the epoch and the low-order |
| 32 bits for the counter. Because it has two parts, we represent the zxid both as a |
| number and as a pair of integers, (<em>epoch, count</em>). The epoch number represents a |
| change in leadership. Each time a new leader comes into power it will have its |
| own epoch number. We have a simple algorithm to assign a unique zxid to a proposal: |
| the leader simply increments the zxid to obtain a unique zxid for each proposal. |
| <em>Leadership activation will ensure that only one leader uses a given epoch, so our |
| simple algorithm guarantees that every proposal will have a unique id.</em> |
| |
| </p> |
| <p> |
| ZooKeeper messaging consists of two phases:</p> |
| <dl> |
| |
| <dt> |
| <term> |
| <em>Leader activation</em> |
| </term> |
| </dt> |
| <dd> |
| <p>In this phase a leader establishes the correct state of the system |
| and gets ready to start making proposals.</p> |
| </dd> |
| |
| |
| <dt> |
| <term> |
| <em>Active messaging</em> |
| </term> |
| </dt> |
| <dd> |
| <p>In this phase a leader accepts messages to propose and coordinates message delivery.</p> |
| </dd> |
| |
| </dl> |
| <p> |
| ZooKeeper is a holistic protocol. We do not focus on individual proposals, rather |
| look at the stream of proposals as a whole. Our strict ordering allows us to do this |
| efficiently and greatly simplifies our protocol. Leadership activation embodies |
| this holistic concept. A leader becomes active only when a quorum of followers |
| (the leader counts as a follower as well; you can always vote for yourself) has synced |
| up with the leader, that is, they have the same state. This state consists of all of the |
| proposals that the leader believes have been committed and the proposal to follow |
| the leader, the NEW_LEADER proposal. (Hopefully you are thinking to |
| yourself, <em>Does the set of proposals that the leader believes have been committed |
| include all the proposals that really have been committed?</em> The answer is <em>yes</em>. |
| Below, we make clear why.) |
| </p> |
| <a name="sc_leaderElection"></a> |
| <h3 class="h4">Leader Activation</h3> |
| <p> |
| Leader activation includes leader election. We currently have two leader election |
| algorithms in ZooKeeper: LeaderElection and FastLeaderElection (AuthFastLeaderElection |
| is a variant of FastLeaderElection that uses UDP and allows servers to perform a simple |
| form of authentication to avoid IP spoofing). ZooKeeper messaging doesn't care about the |
| exact method of electing a leader as long as the following holds: |
| </p> |
| <ul> |
| |
| |
| <li> |
| <p>The leader has seen the highest zxid of all the followers.</p> |
| </li> |
| |
| <li> |
| <p>A quorum of servers have committed to following the leader.</p> |
| </li> |
| |
| |
| </ul> |
| <p> |
| Of these two requirements only the first, the highest zxid among the followers, |
| needs to hold for correct operation. The second requirement, a quorum of followers, |
| just needs to hold with high probability. We are going to recheck the second requirement, |
| so if a failure happens during or after the leader election and quorum is lost, |
| we will recover by abandoning leader activation and running another election. |
| </p> |
| <p> |
| After leader election a single server will be designated as a leader and start |
| waiting for followers to connect. The rest of the servers will try to connect to |
| the leader. The leader will sync up with followers by sending any proposals they |
| are missing, or if a follower is missing too many proposals, it will send a full |
| snapshot of the state to the follower. |
| </p> |
| <p> |
| There is a corner case in which a follower that has proposals, U, not seen |
| by a leader arrives. Proposals are seen in order, so the proposals of U will have zxids |
| higher than zxids seen by the leader. The follower must have arrived after the |
| leader election, otherwise the follower would have been elected leader given that |
| it has seen a higher zxid. Since committed proposals must be seen by a quorum of |
| servers, and a quorum of servers that elected the leader did not see U, the proposals |
| of U have not been committed, so they can be discarded. When the follower connects |
| to the leader, the leader will tell the follower to discard U. |
| </p> |
| <p> |
| A new leader establishes a zxid to start using for new proposals by getting the |
| epoch, e, of the highest zxid it has seen and setting the next zxid to use to be |
| (e+1, 0). After the leader syncs with a follower, it will propose a NEW_LEADER |
| proposal. Once the NEW_LEADER proposal has been committed, the leader will activate |
| and start receiving and issuing proposals. |
| </p> |
| <p> |
| It all sounds complicated but here are the basic rules of operation during leader |
| activation: |
| </p> |
| <ul> |
| |
| <li> |
| <p>A follower will ACK the NEW_LEADER proposal after it has synced with the leader.</p> |
| </li> |
| |
| <li> |
| <p>A follower will only ACK a NEW_LEADER proposal with a given zxid from a single server.</p> |
| </li> |
| |
| <li> |
| <p>A new leader will COMMIT the NEW_LEADER proposal when a quorum of followers have ACKed it.</p> |
| </li> |
| |
| <li> |
| <p>A follower will commit any state it received from the leader when the NEW_LEADER proposal is COMMITTED.</p> |
| </li> |
| |
| <li> |
| <p>A new leader will not accept new proposals until the NEW_LEADER proposal has been COMMITTED.</p> |
| </li> |
| |
| </ul> |
| <p> |
| If leader election terminates erroneously, we don't have a problem since the |
| NEW_LEADER proposal will not be committed since the leader will not have quorum. |
| When this happens, the leader and any remaining followers will timeout and go back |
| to leader election. |
| </p> |
| <a name="sc_activeMessaging"></a> |
| <h3 class="h4">Active Messaging</h3> |
| <p> |
| Leader Activation does all the heavy lifting. Once the leader is coronated he can |
| start blasting out proposals. As long as he remains the leader no other leader can |
| emerge since no other leader will be able to get a quorum of followers. If a new |
| leader does emerge, |
| it means that the previous leader has lost quorum, and the new leader will clean up any |
| mess left over during her leadership activation. |
| </p> |
| <p>ZooKeeper messaging operates similarly to a classic two-phase commit.</p> |
| <img alt="" src="images/2pc.jpg"><p> |
| All communication channels are FIFO, so everything is done in order. Specifically |
| the following operating constraints are observed:</p> |
| <ul> |
| |
| |
| <li> |
| <p>The leader sends proposals to all followers using |
| the same order. Moreover, this order follows the order in which requests have been |
| received. Because we use FIFO channels this means that followers also receive proposals in order. |
| </p> |
| </li> |
| |
| |
| <li> |
| <p>Followers process messages in the order they are received. This |
| means that messages will be ACKed in order and the leader will receive ACKs from |
| followers in order, due to the FIFO channels. It also means that if message <em>m</em> |
| has been written to non-volatile storage, all messages that were proposed before |
| <em>m</em> have been written to non-volatile storage.</p> |
| </li> |
| |
| |
| <li> |
| <p>The leader will issue a COMMIT to all followers as soon as a |
| quorum of followers have ACKed a message. Since messages are ACKed in order, |
| COMMITs will be sent by the leader as received by the followers in order.</p> |
| </li> |
| |
| |
| <li> |
| <p>COMMITs are processed in order. Followers deliver a proposal's |
| message when that proposal is committed.</p> |
| </li> |
| |
| |
| </ul> |
| <a name="sc_summary"></a> |
| <h3 class="h4">Summary</h3> |
| <p>So there you go. Why does it work? Specifically, why does the set of proposals |
| believed by a new leader always contain any proposal that has actually been committed? |
| First, all proposals have a unique zxid, so unlike other protocols, we never have |
| to worry about two different values being proposed for the same zxid; followers |
| (a leader is also a follower) see and record proposals in order; proposals are |
| committed in order; there is only one active leader at a time since followers only |
| follow a single leader at a time; a new leader has seen all committed proposals |
| from the previous epoch since it has seen the highest zxid from a quorum of servers; |
| any uncommitted proposals from a previous epoch seen by a new leader will be committed |
| by that leader before it becomes active.</p> |
| <a name="sc_comparisons"></a> |
| <h3 class="h4">Comparisons</h3> |
| <p> |
| Isn't this just Multi-Paxos? No, Multi-Paxos requires some way of assuring that |
| there is only a single coordinator. We do not count on such assurances. Instead |
| we use the leader activation to recover from leadership change or old leaders |
| believing they are still active. |
| </p> |
| <p> |
| Isn't this just Paxos? Your active messaging phase looks just like phase 2 of Paxos? |
| Actually, to us active messaging looks just like 2 phase commit without the need to |
| handle aborts. Active messaging is different from both in the sense that it has |
| cross proposal ordering requirements. If we do not maintain strict FIFO ordering of |
| all packets, it all falls apart. Also, our leader activation phase is different from |
| both of them. In particular, our use of epochs allows us to skip blocks of uncommitted |
| proposals and to not worry about duplicate proposals for a given zxid. |
| </p> |
| </div> |
| |
| |
| <a name="sc_quorum"></a> |
| <h2 class="h3">Quorums</h2> |
| <div class="section"> |
| <p> |
| Atomic broadcast and leader election use the notion of quorum to guarantee a consistent |
| view of the system. By default, ZooKeeper uses majority quorums, which means that every |
| voting round that happens in one of these protocols requires a majority to vote. One example is |
| acknowledging a leader proposal: the leader can only commit once it receives an |
| acknowledgement from a quorum of servers. |
| </p> |
| <p> |
| If we extract the properties that we really need from our use of majorities, we have that we only |
| need to guarantee that groups of processes used to validate an operation by voting (e.g., acknowledging |
| a leader proposal) pairwise intersect in at least one server. Using majorities guarantees such a property. |
| However, there are other ways of constructing quorums different from majorities. For example, we can assign |
| weights to the votes of servers, and say that the votes of some servers are more important. To obtain a quorum, |
| we get enough votes so that the sum of weights of all votes is larger than half of the total sum of all weights. |
| </p> |
| <p> |
| A different construction that uses weights and is useful in wide-area deployments (co-locations) is a hierarchical |
| one. With this construction, we split the servers into disjoint groups and assign weights to processes. To form |
| a quorum, we have to get a hold of enough servers from a majority of groups G, such that for each group g in G, |
| the sum of votes from g is larger than half of the sum of weights in g. Interestingly, this construction enables |
| smaller quorums. If we have, for example, 9 servers, we split them into 3 groups, and assign a weight of 1 to each |
| server, then we are able to form quorums of size 4. Note that two subsets of processes composed each of a majority |
| of servers from each of a majority of groups necessarily have a non-empty intersection. It is reasonable to expect |
| that a majority of co-locations will have a majority of servers available with high probability. |
| </p> |
| <p> |
| With ZooKeeper, we provide a user with the ability of configuring servers to use majority quorums, weights, or a |
| hierarchy of groups. |
| </p> |
| </div> |
| |
| |
| <a name="sc_logging"></a> |
| <h2 class="h3">Logging</h2> |
| <div class="section"> |
| <p> |
| ZooKeeper uses |
| <a href="http://logging.apache.org/log4j">log4j</a> |
| version 1.2 as its logging infrastructure. For information on configuring log4j for |
| ZooKeeper, see the <a href="zookeeperAdmin.html#sc_logging">Logging</a> section |
| of the <a href="zookeeperAdmin.html">ZooKeeper Administrator's Guide.</a> |
| |
| </p> |
| <a name="sc_developerGuidelines"></a> |
| <h3 class="h4">Developer Guidelines</h3> |
| <p>Please follow these guidelines when submitting code. Patch reviewers will look for the following:</p> |
| <a name="sc_rightLevel"></a> |
| <h4>Logging at the Right Level</h4> |
| <p> |
| There are <a href="http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/Level.html#FATAL">6 levels of logging in log4j</a>. |
| It's important to pick the right one. In order of higher to lower severity:</p> |
| <ol> |
| |
| <li> |
| <p>FATAL level designates very severe error events that will presumably lead the application to abort.</p> |
| </li> |
| |
| <li> |
| <p>ERROR level designates error events that might still allow the application to continue running.</p> |
| </li> |
| |
| <li> |
| <p>WARN level designates potentially harmful situations.</p> |
| </li> |
| |
| <li> |
| <p>INFO level designates informational messages that highlight the progress of the application at coarse-grained level.</p> |
| </li> |
| |
| <li> |
| <p>DEBUG Level designates fine-grained informational events that are most useful to debug an application.</p> |
| </li> |
| |
| <li> |
| <p>TRACE Level designates finer-grained informational events than the DEBUG.</p> |
| </li> |
| |
| </ol> |
| <p> |
| ZooKeeper is typically run in production such that log messages of INFO level |
| severity and higher (more severe) are output to the log.</p> |
| <a name="sc_log4jIdioms"></a> |
| <h4>Use of Standard log4j Idioms</h4> |
| <p> |
| <em>Static Message Logging</em> |
| </p> |
| <pre class="code"> |
| LOG.debug("process completed successfully!"); |
| </pre> |
| <p>However when creating a message from a number of components (string |
| concatenation), the log call should be wrapped with an "isXEnabled()" call. This |
| eliminates the string concatenation overhead when debug level logging is not enabled. |
| </p> |
| <pre class="code"> |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("got " + count + " messages in " + time + " minutes"); |
| } |
| </pre> |
| <p> |
| <em>Naming</em> |
| </p> |
| <p> |
| Loggers should be named after the class in which they are used. (See the |
| <a href="http://logging.apache.org/log4j/1.2/faq.html#2.4">log4j faq</a> |
| for reasons why this is a good idea.) |
| </p> |
| <pre class="code"> |
| public class Foo { |
| private static final Logger LOG = Logger.getLogger(Foo.class); |
| .... |
| public Foo() { |
| LOG.info("constructing Foo"); |
| </pre> |
| <p> |
| <em>Exception handling</em> |
| </p> |
| <pre class="code"> |
| try { |
| // code |
| } catch (XYZException e) { |
| // do this |
| LOG.error("Something bad happened", e); |
| // don't do this (generally) |
| // LOG.error(e); |
| // why? because "don't do" case hides the stack trace |
| |
| // continue process here as you need... recover or (re)throw |
| } |
| </pre> |
| </div> |
| |
| |
| <p align="right"> |
| <font size="-2"></font> |
| </p> |
| </div> |
| <!--+ |
| |end content |
| +--> |
| <div class="clearboth"> </div> |
| </div> |
| <div id="footer"> |
| <!--+ |
| |start bottomstrip |
| +--> |
| <div class="lastmodified"> |
| <script type="text/javascript"><!-- |
| document.write("Last Published: " + document.lastModified); |
| // --></script> |
| </div> |
| <div class="copyright"> |
| Copyright © |
| 2008 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> |
| </div> |
| <!--+ |
| |end bottomstrip |
| +--> |
| </div> |
| </body> |
| </html> |