src/java/org/apache/cassandra/db/tries/InMemoryReadTrie.java - cassandra - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.cassandra.db.tries;

 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicReferenceArray;
 import java.util.function.Function;

 import org.agrona.concurrent.UnsafeBuffer;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;

 /**
  * In-memory trie built for fast modification and reads executing concurrently with writes from a single mutator thread.
  *
  * This class provides the read-only functionality, expanded in {@link InMemoryTrie} to writes.
  */
 public class InMemoryReadTrie<T> extends Trie<T>
 {
     /*
     TRIE FORMAT AND NODE TYPES

     The memtable trie uses five different types of nodes:
      - "leaf" nodes, which have content and no children;
      - single-transition "chain" nodes, which have exactly one child; while each node is a single transition, they are
        called "chain" because multiple such transition are packed in a block.
      - "sparse" nodes which have between two and six children;
      - "split" nodes for anything above six children;
      - "prefix" nodes that augment one of the other types (except leaf) with content.

     The data for all nodes except leaf ones is stored in a contiguous 'node buffer' and laid out in blocks of 32 bytes.
     A block only contains data for a single type of node, but there is no direct correspondence between block and node
     in that:
      - a single block can contain multiple "chain" nodes.
      - a sparse node occupies exactly one block.
      - a split node occupies a variable number of blocks.
      - a prefix node can be placed in the same block as the node it augments, or in a separate block.

     Nodes are referenced in that buffer by an integer position/pointer, the 'node pointer'. Note that node pointers are
     not pointing at the beginning of blocks, and we call 'pointer offset' the offset of the node pointer to the block it
     points into. The value of a 'node pointer' is used to decide what kind of node is pointed:

      - If the pointer is negative, we have a leaf node. Since a leaf has no children, we need no data outside of its
        content to represent it, and that content is stored in a 'content list', not in the nodes buffer. The content
        of a particular leaf node is located at the ~pointer position in the content list (~ instead of - so that -1 can
        correspond to position 0).

      - If the 'pointer offset' is smaller than 28, we have a chain node with one transition. The transition character is
        the byte at the position pointed in the 'node buffer', and the child is pointed by:
        - the integer value at offset 28 of the block pointed if the 'pointer offset' is 27
        - pointer + 1 (which is guaranteed to have offset smaller than 28, i.e. to be a chain node), otherwise
        In other words, a chain block contains a sequence of characters that leads to the child whose address is at
        offset 28. It may have between 1 and 28 characters depending on the pointer with which the block is entered.

      - If the 'pointer offset' is 30, we have a sparse node. The data of a sparse node occupies a full block and is laid
        out as:
        - six pointers to children at offsets 0 to 24
        - six transition characters at offsets 24 to 30
        - an order word stored in the two bytes at offset 30
        To enable in-place addition of children, the pointers and transition characters are not stored ordered.
        Instead, we use an order encoding in the last 2 bytes of the node. The encoding is a base-6 number which
        describes the order of the transitions (least significant digit being the smallest).
        The node must have at least two transitions and the transition at position 0 is never the biggest (we can
        enforce this by choosing for position 0 the smaller of the two transitions a sparse node starts with). This
        allows iteration over the order word (which divides said word by 6 each step) to finish when the result becomes 0.

      - If the 'pointer offset' is 28, the node is a split one. Split nodes are dense, meaning that there is a direct
        mapping between a transition character and the address of the associated pointer, and new children can easily be
        added in place.
        Split nodes occupy multiple blocks, and a child is located by traversing 3 layers of pointers:
        - the first pointer is within the top-level block (the one pointed by the pointer) and points to a "mid" block.
          The top-level block has 4 such pointers to "mid" block, located between offset 16 and 32.
        - the 2nd pointer is within the "mid" block and points to a "tail" block. A "mid" block has 8 such pointers
          occupying the whole block.
        - the 3rd pointer is with the "tail" block and is the actual child pointer. Like "mid" block, there are 8 such
          pointers (so we finally address 4 * 8 * 8 = 256 children).
        To find a child, we thus need to know the index of the pointer to follow within the top-level block, the index
        of the one in the "mid" block and the index in the "tail" block. For that, we split the transition byte in a
        sequence of 2-3-3 bits:
        - the first 2 bits are the index in the top-level block;
        - the next 3 bits, the index in the "mid" block;
        - and the last 3 bits the index in the "tail" block.
        This layout allows the node to use the smaller fixed-size blocks (instead of 256*4 bytes for the whole character
        space) and also leaves some room in the head block (the 16 first bytes) for additional information (which we can
        use to store prefix nodes containing things like deletion times).
        One split node may need up to 1 + 4 + 4*8 blocks (1184 bytes) to store all its children.

      - If the pointer offset is 31, we have a prefix node. These are two types:
        -- Embedded prefix nodes occupy the free bytes in a chain or split node. The byte at offset 4 has the offset
           within the 32-byte block for the augmented node.
        -- Full prefix nodes have 0xFF at offset 4 and a pointer at 28, pointing to the augmented node.
        Both types contain an index for content at offset 0. The augmented node cannot be a leaf or NONE -- in the former
        case the leaf itself contains the content index, in the latter we use a leaf instead.
        The term "node" when applied to these is a bit of a misnomer as they are not presented as separate nodes during
        traversals. Instead, they augment a node, changing only its content. Internally we create a Node object for the
        augmented node and wrap a PrefixNode around it, which changes the `content()` method and routes all other
        calls to the augmented node's methods.

      When building a trie we first allocate the content, then create a chain node leading to it. While we only have
      single transitions leading to a chain node, we can expand that node (attaching a character and using pointer - 1)
      instead of creating a new one. When a chain node already has a child and needs a new one added we change the type
      (i.e. create a new node and remap the parent) to sparse with two children. When a six-child sparse node needs a new
      child, we switch to split.

      Blocks currently are not reused, because we do not yet have a mechanism to tell when readers are done with blocks
      they are referencing. This currently causes a very low overhead (because we change data in place with the only
      exception of nodes needing to change type) and is planned to be addressed later.

      For further descriptions and examples of the mechanics of the trie, see InMemoryTrie.md.
      */

     static final int BLOCK_SIZE = 32;

     // Biggest block offset that can contain a pointer.
     static final int LAST_POINTER_OFFSET = BLOCK_SIZE - 4;

     /*
      Block offsets used to identify node types (by comparing them to the node 'pointer offset').
      */

     // split node (dense, 2-3-3 transitions), laid out as 4 pointers to "mid" block, with has 8 pointers to "tail" block,
     // which has 8 pointers to children
     static final int SPLIT_OFFSET = BLOCK_SIZE - 4;
     // sparse node, unordered list of up to 6 transition, laid out as 6 transition pointers followed by 6 transition
     // bytes. The last two bytes contain an ordering of the transitions (in base-6) which is used for iteration. On
     // update the pointer is set last, i.e. during reads the node may show that a transition exists and list a character
     // for it, but pointer may still be null.
     static final int SPARSE_OFFSET = BLOCK_SIZE - 2;
     // min and max offset for a chain node. A block of chain node is laid out as a pointer at LAST_POINTER_OFFSET,
     // preceded by characters that lead to it. Thus a full chain block contains BLOCK_SIZE-4 transitions/chain nodes.
     static final int CHAIN_MIN_OFFSET = 0;
     static final int CHAIN_MAX_OFFSET = BLOCK_SIZE - 5;
     // Prefix node, an intermediate node augmenting its child node with content.
     static final int PREFIX_OFFSET = BLOCK_SIZE - 1;

     /*
      Offsets and values for navigating in a block for particular node type. Those offsets are 'from the node pointer'
      (not the block start) and can be thus negative since node pointers points towards the end of blocks.
      */

     // Limit for the starting cell / sublevel (2 bits -> 4 pointers).
     static final int SPLIT_START_LEVEL_LIMIT = 4;
     // Limit for the other sublevels (3 bits -> 8 pointers).
     static final int SPLIT_OTHER_LEVEL_LIMIT = 8;
     // Bitshift between levels.
     static final int SPLIT_LEVEL_SHIFT = 3;

     static final int SPARSE_CHILD_COUNT = 6;
     // Offset to the first child pointer of a spare node (laid out from the start of the block)
     static final int SPARSE_CHILDREN_OFFSET = 0 - SPARSE_OFFSET;
     // Offset to the first transition byte of a sparse node (laid out after the child pointers)
     static final int SPARSE_BYTES_OFFSET = SPARSE_CHILD_COUNT * 4 - SPARSE_OFFSET;
     // Offset to the order word of a sparse node (laid out after the children (pointer + transition byte))
     static final int SPARSE_ORDER_OFFSET = SPARSE_CHILD_COUNT * 5 - SPARSE_OFFSET;  // 0

     // Offset of the flag byte in a prefix node. In shared blocks, this contains the offset of the next node.
     static final int PREFIX_FLAGS_OFFSET = 4 - PREFIX_OFFSET;
     // Offset of the content id
     static final int PREFIX_CONTENT_OFFSET = 0 - PREFIX_OFFSET;
     // Offset of the next pointer in a non-shared prefix node
     static final int PREFIX_POINTER_OFFSET = LAST_POINTER_OFFSET - PREFIX_OFFSET;

     /**
      * Value used as null for node pointers.
      * No node can use this address (we enforce this by not allowing chain nodes to grow to position 0).
      * Do not change this as the code relies there being a NONE placed in all bytes of the block that are not set.
      */
     static final int NONE = 0;

     volatile int root;

     /*
      EXPANDABLE DATA STORAGE

      The tries will need more and more space in buffers and content lists as they grow. Instead of using ArrayList-like
      reallocation with copying, which may be prohibitively expensive for large buffers, we use a sequence of
      buffers/content arrays that double in size on every expansion.

      For a given address x the index of the buffer can be found with the following calculation:
         index_of_most_significant_set_bit(x / min_size + 1)
      (relying on sum (2^i) for i in [0, n-1] == 2^n - 1) which can be performed quickly on modern hardware.

      Finding the offset within the buffer is then
         x + min - (min << buffer_index)

      The allocated space starts 256 bytes for the buffer and 16 entries for the content list.

      Note that a buffer is not allowed to split 32-byte blocks (code assumes same buffer can be used for all bytes
      inside the block).
      */

     static final int BUF_START_SHIFT = 8;
     static final int BUF_START_SIZE = 1 << BUF_START_SHIFT;

     static final int CONTENTS_START_SHIFT = 4;
     static final int CONTENTS_START_SIZE = 1 << CONTENTS_START_SHIFT;

     static
     {
         assert BUF_START_SIZE % BLOCK_SIZE == 0 : "Initial buffer size must fit a full block.";
     }

     final UnsafeBuffer[] buffers;
     final AtomicReferenceArray<T>[] contentArrays;

     InMemoryReadTrie(UnsafeBuffer[] buffers, AtomicReferenceArray<T>[] contentArrays, int root)
     {
         this.buffers = buffers;
         this.contentArrays = contentArrays;
         this.root = root;
     }

     /*
      Buffer, content list and block management
      */
     int getChunkIdx(int pos, int minChunkShift, int minChunkSize)
     {
         return 31 - minChunkShift - Integer.numberOfLeadingZeros(pos + minChunkSize);
     }

     int inChunkPointer(int pos, int chunkIndex, int minChunkSize)
     {
         return pos + minChunkSize - (minChunkSize << chunkIndex);
     }

     UnsafeBuffer getChunk(int pos)
     {
         int leadBit = getChunkIdx(pos, BUF_START_SHIFT, BUF_START_SIZE);
         return buffers[leadBit];
     }

     int inChunkPointer(int pos)
     {
         int leadBit = getChunkIdx(pos, BUF_START_SHIFT, BUF_START_SIZE);
         return inChunkPointer(pos, leadBit, BUF_START_SIZE);
     }


     /**
      * Pointer offset for a node pointer.
      */
     int offset(int pos)
     {
         return pos & (BLOCK_SIZE - 1);
     }

     final int getUnsignedByte(int pos)
     {
         return getChunk(pos).getByte(inChunkPointer(pos)) & 0xFF;
     }

     final int getUnsignedShort(int pos)
     {
         return getChunk(pos).getShort(inChunkPointer(pos)) & 0xFFFF;
     }

     final int getInt(int pos)
     {
         return getChunk(pos).getInt(inChunkPointer(pos));
     }

     T getContent(int index)
     {
         int leadBit = getChunkIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE);
         int ofs = inChunkPointer(index, leadBit, CONTENTS_START_SIZE);
         AtomicReferenceArray<T> array = contentArrays[leadBit];
         return array.get(ofs);
     }

     /*
      Reading node content
      */

     boolean isNull(int node)
     {
         return node == NONE;
     }

     boolean isLeaf(int node)
     {
         return node < NONE;
     }

     boolean isNullOrLeaf(int node)
     {
         return node <= NONE;
     }

     /**
      * Returns the number of transitions in a chain block entered with the given pointer.
      */
     private int chainBlockLength(int node)
     {
         return LAST_POINTER_OFFSET - offset(node);
     }

     /**
      * Get a node's child for the given transition character
      */
     int getChild(int node, int trans)
     {
         if (isNullOrLeaf(node))
             return NONE;

         node = followContentTransition(node);

         switch (offset(node))
         {
             case SPARSE_OFFSET:
                 return getSparseChild(node, trans);
             case SPLIT_OFFSET:
                 return getSplitChild(node, trans);
             case CHAIN_MAX_OFFSET:
                 if (trans != getUnsignedByte(node))
                     return NONE;
                 return getInt(node + 1);
             default:
                 if (trans != getUnsignedByte(node))
                     return NONE;
                 return node + 1;
         }
     }

     protected int followContentTransition(int node)
     {
         if (isNullOrLeaf(node))
             return NONE;

         if (offset(node) == PREFIX_OFFSET)
         {
             int b = getUnsignedByte(node + PREFIX_FLAGS_OFFSET);
             if (b < BLOCK_SIZE)
                 node = node - PREFIX_OFFSET + b;
             else
                 node = getInt(node + PREFIX_POINTER_OFFSET);

             assert node >= 0 && offset(node) != PREFIX_OFFSET;
         }
         return node;
     }

     /**
      * Advance as long as the cell pointed to by the given pointer will let you.
      * <p>
      * This is the same as getChild(node, first), except for chain nodes where it would walk the fill chain as long as
      * the input source matches.
      */
     int advance(int node, int first, ByteSource rest)
     {
         if (isNullOrLeaf(node))
             return NONE;

         node = followContentTransition(node);

         switch (offset(node))
         {
             case SPARSE_OFFSET:
                 return getSparseChild(node, first);
             case SPLIT_OFFSET:
                 return getSplitChild(node, first);
             default:
                 // Check the first byte matches the expected
                 if (getUnsignedByte(node++) != first)
                     return NONE;
                 // Check the rest of the bytes provided by the chain node
                 for (int length = chainBlockLength(node); length > 0; --length)
                 {
                     first = rest.next();
                     if (getUnsignedByte(node++) != first)
                         return NONE;
                 }
                 // All bytes matched, node is now positioned on the child pointer. Follow it.
                 return getInt(node);
         }
     }

     /**
      * Get the child for the given transition character, knowing that the node is sparse
      */
     int getSparseChild(int node, int trans)
     {
         for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
         {
             if (getUnsignedByte(node + SPARSE_BYTES_OFFSET + i) == trans)
             {
                 int child = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);

                 // we can't trust the transition character read above, because it may have been fetched before a
                 // concurrent update happened, and the update may have managed to modify the pointer by now.
                 // However, if we read it now that we have accessed the volatile pointer, it must have the correct
                 // value as it is set before the pointer.
                 if (child != NONE && getUnsignedByte(node + SPARSE_BYTES_OFFSET + i) == trans)
                     return child;
             }
         }
         return NONE;
     }

     /**
      * Given a transition, returns the corresponding index (within the node block) of the pointer to the mid block of
      * a split node.
      */
     int splitNodeMidIndex(int trans)
     {
         // first 2 bits of the 2-3-3 split
         return (trans >> 6) & 0x3;
     }

     /**
      * Given a transition, returns the corresponding index (within the mid block) of the pointer to the tail block of
      * a split node.
      */
     int splitNodeTailIndex(int trans)
     {
         // second 3 bits of the 2-3-3 split
         return (trans >> 3) & 0x7;
     }

     /**
      * Given a transition, returns the corresponding index (within the tail block) of the pointer to the child of
      * a split node.
      */
     int splitNodeChildIndex(int trans)
     {
         // third 3 bits of the 2-3-3 split
         return trans & 0x7;
     }

     /**
      * Get the child for the given transition character, knowing that the node is split
      */
     int getSplitChild(int node, int trans)
     {
         int mid = getSplitBlockPointer(node, splitNodeMidIndex(trans), SPLIT_START_LEVEL_LIMIT);
         if (isNull(mid))
             return NONE;

         int tail = getSplitBlockPointer(mid, splitNodeTailIndex(trans), SPLIT_OTHER_LEVEL_LIMIT);
         if (isNull(tail))
             return NONE;
         return getSplitBlockPointer(tail, splitNodeChildIndex(trans), SPLIT_OTHER_LEVEL_LIMIT);
     }

     /**
      * Get the content for a given node
      */
     T getNodeContent(int node)
     {
         if (isLeaf(node))
             return getContent(~node);

         if (offset(node) != PREFIX_OFFSET)
             return null;

         int index = getInt(node + PREFIX_CONTENT_OFFSET);
         return (index >= 0)
                ? getContent(index)
                : null;
     }

     int splitBlockPointerAddress(int node, int childIndex, int subLevelLimit)
     {
         return node - SPLIT_OFFSET + (8 - subLevelLimit + childIndex) * 4;
     }

     int getSplitBlockPointer(int node, int childIndex, int subLevelLimit)
     {
         return getInt(splitBlockPointerAddress(node, childIndex, subLevelLimit));
     }

     /**
      * Backtracking state for a cursor.
      *
      * To avoid allocations and pointer-chasing, the backtracking data is stored in a simple int array with
      * BACKTRACK_INTS_PER_ENTRY ints for each level.
      */
     private static class CursorBacktrackingState
     {
         static final int BACKTRACK_INTS_PER_ENTRY = 3;
         static final int BACKTRACK_INITIAL_SIZE = 16;
         private int[] backtrack = new int[BACKTRACK_INITIAL_SIZE * BACKTRACK_INTS_PER_ENTRY];
         int backtrackDepth = 0;

         void addBacktrack(int node, int data, int depth)
         {
             if (backtrackDepth * BACKTRACK_INTS_PER_ENTRY >= backtrack.length)
                 backtrack = Arrays.copyOf(backtrack, backtrack.length * 2);
             backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 0] = node;
             backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 1] = data;
             backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 2] = depth;
             ++backtrackDepth;
         }

         int node(int backtrackDepth)
         {
             return backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 0];
         }

         int data(int backtrackDepth)
         {
             return backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 1];
         }

         int depth(int backtrackDepth)
         {
             return backtrack[backtrackDepth * BACKTRACK_INTS_PER_ENTRY + 2];
         }
     }

     /*
      * Cursor implementation.
      *
      * InMemoryTrie cursors maintain their backtracking state in CursorBacktrackingState where they store
      * information about the node to backtrack to and the transitions still left to take or attempt.
      *
      * This information is different for the different types of node:
      * - for leaf and chain no backtracking is saved (because we know there are no further transitions)
      * - for sparse we store the remainder of the order word
      * - for split we store one entry per sub-level of the 2-3-3 split
      *
      * When the cursor is asked to advance it first checks the current node for children, and if there aren't any
      * (i.e. it is positioned on a leaf node), it goes one level up the backtracking chain, where we are guaranteed to
      * have a remaining child to advance to. When there's nothing to backtrack to, the trie is exhausted.
      */
     class MemtableCursor extends CursorBacktrackingState implements Cursor<T>
     {
         private int currentNode;
         private int incomingTransition;
         private T content;
         private int depth = -1;

         MemtableCursor()
         {
             descendInto(root, -1);
         }

         @Override
         public int advance()
         {
             if (isNullOrLeaf(currentNode))
                 return backtrack();
             else
                 return advanceToFirstChild(currentNode);
         }

         @Override
         public int advanceMultiple(TransitionsReceiver receiver)
         {
             int node = currentNode;
             if (!isChainNode(node))
                 return advance();

             // Jump directly to the chain's child.
             UnsafeBuffer chunk = getChunk(node);
             int inChunkNode = inChunkPointer(node);
             int bytesJumped = chainBlockLength(node) - 1;   // leave the last byte for incomingTransition
             if (receiver != null && bytesJumped > 0)
                 receiver.addPathBytes(chunk, inChunkNode, bytesJumped);
             depth += bytesJumped;    // descendInto will add one
             inChunkNode += bytesJumped;

             // inChunkNode is now positioned on the last byte of the chain.
             // Consume it to be the new state's incomingTransition.
             int transition = chunk.getByte(inChunkNode++) & 0xFF;
             // inChunkNode is now positioned on the child pointer.
             int child = chunk.getInt(inChunkNode);
             return descendInto(child, transition);
         }

         @Override
         public int skipChildren()
         {
             return backtrack();
         }

         @Override
         public int depth()
         {
             return depth;
         }

         @Override
         public T content()
         {
             return content;
         }

         @Override
         public int incomingTransition()
         {
             return incomingTransition;
         }

         private int backtrack()
         {
             if (--backtrackDepth < 0)
                 return depth = -1;

             depth = depth(backtrackDepth);
             return advanceToNextChild(node(backtrackDepth), data(backtrackDepth));
         }

         private int advanceToFirstChild(int node)
         {
             assert (!isNullOrLeaf(node));

             switch (offset(node))
             {
                 case SPLIT_OFFSET:
                     return descendInSplitSublevel(node, SPLIT_START_LEVEL_LIMIT, 0, SPLIT_LEVEL_SHIFT * 2);
                 case SPARSE_OFFSET:
                     return nextValidSparseTransition(node, getUnsignedShort(node + SPARSE_ORDER_OFFSET));
                 default:
                     return getChainTransition(node);
             }
         }

         private int advanceToNextChild(int node, int data)
         {
             assert (!isNullOrLeaf(node));

             switch (offset(node))
             {
                 case SPLIT_OFFSET:
                     return nextValidSplitTransition(node, data);
                 case SPARSE_OFFSET:
                     return nextValidSparseTransition(node, data);
                 default:
                     throw new AssertionError("Unexpected node type in backtrack state.");
             }
         }

         /**
          * Descend into the sub-levels of a split node. Advances to the first child and creates backtracking entries
          * for the following ones. We use the bits of trans (lowest non-zero ones) to identify which sub-level an
          * entry refers to.
          *
          * @param node The node or block id, must have offset SPLIT_OFFSET.
          * @param limit The transition limit for the current sub-level (4 for the start, 8 for the others).
          * @param collected The transition bits collected from the parent chain (e.g. 0x40 after following 1 on the top
          *                  sub-level).
          * @param shift This level's bit shift (6 for start, 3 for mid and 0 for tail).
          * @return the depth reached after descending.
          */
         private int descendInSplitSublevel(int node, int limit, int collected, int shift)
         {
             while (true)
             {
                 assert offset(node) == SPLIT_OFFSET;
                 int childIndex;
                 int child = NONE;
                 // find the first non-null child
                 for (childIndex = 0; childIndex < limit; ++childIndex)
                 {
                     child = getSplitBlockPointer(node, childIndex, limit);
                     if (!isNull(child))
                         break;
                 }
                 // there must be at least one child
                 assert childIndex < limit && child != NONE;

                 // look for any more valid transitions and add backtracking if found
                 maybeAddSplitBacktrack(node, childIndex, limit, collected, shift);

                 // add the bits just found
                 collected |= childIndex << shift;
                 // descend to next sub-level or child
                 if (shift == 0)
                     return descendInto(child, collected);
                 // continue with next sublevel; same as
                 // return descendInSplitSublevel(child + SPLIT_OFFSET, 8, collected, shift - 3)
                 node = child;
                 limit = SPLIT_OTHER_LEVEL_LIMIT;
                 shift -= SPLIT_LEVEL_SHIFT;
             }
         }

         /**
          * Backtrack to a split sub-level. The level is identified by the lowest non-0 bits in trans.
          */
         private int nextValidSplitTransition(int node, int trans)
         {
             assert trans >= 0 && trans <= 0xFF;
             int childIndex = splitNodeChildIndex(trans);
             if (childIndex > 0)
             {
                 maybeAddSplitBacktrack(node,
                                        childIndex,
                                        SPLIT_OTHER_LEVEL_LIMIT,
                                        trans & -(1 << (SPLIT_LEVEL_SHIFT * 1)),
                                        SPLIT_LEVEL_SHIFT * 0);
                 int child = getSplitBlockPointer(node, childIndex, SPLIT_OTHER_LEVEL_LIMIT);
                 return descendInto(child, trans);
             }
             int tailIndex = splitNodeTailIndex(trans);
             if (tailIndex > 0)
             {
                 maybeAddSplitBacktrack(node,
                                        tailIndex,
                                        SPLIT_OTHER_LEVEL_LIMIT,
                                        trans & -(1 << (SPLIT_LEVEL_SHIFT * 2)),
                                        SPLIT_LEVEL_SHIFT * 1);
                 int tail = getSplitBlockPointer(node, tailIndex, SPLIT_OTHER_LEVEL_LIMIT);
                 return descendInSplitSublevel(tail,
                                               SPLIT_OTHER_LEVEL_LIMIT,
                                               trans,
                                               SPLIT_LEVEL_SHIFT * 0);
             }
             int midIndex = splitNodeMidIndex(trans);
             assert midIndex > 0;
             maybeAddSplitBacktrack(node,
                                    midIndex,
                                    SPLIT_START_LEVEL_LIMIT,
                                    0,
                                    SPLIT_LEVEL_SHIFT * 2);
             int mid = getSplitBlockPointer(node, midIndex, SPLIT_START_LEVEL_LIMIT);
             return descendInSplitSublevel(mid,
                                           SPLIT_OTHER_LEVEL_LIMIT,
                                           trans,
                                           SPLIT_LEVEL_SHIFT * 1);
         }

         /**
          * Look for any further non-null transitions on this sub-level and, if found, add a backtracking entry.
          */
         private void maybeAddSplitBacktrack(int node, int startAfter, int limit, int collected, int shift)
         {
             int nextChildIndex;
             for (nextChildIndex = startAfter + 1; nextChildIndex < limit; ++nextChildIndex)
             {
                 if (!isNull(getSplitBlockPointer(node, nextChildIndex, limit)))
                     break;
             }
             if (nextChildIndex < limit)
                 addBacktrack(node, collected | (nextChildIndex << shift), depth);
         }

         private int nextValidSparseTransition(int node, int data)
         {
             UnsafeBuffer chunk = getChunk(node);
             int inChunkNode = inChunkPointer(node);

             // Peel off the next index.
             int index = data % SPARSE_CHILD_COUNT;
             data = data / SPARSE_CHILD_COUNT;

             // If there are remaining transitions, add backtracking entry.
             if (data > 0)
                 addBacktrack(node, data, depth);

             // Follow the transition.
             int child = chunk.getInt(inChunkNode + SPARSE_CHILDREN_OFFSET + index * 4);
             int transition = chunk.getByte(inChunkNode + SPARSE_BYTES_OFFSET + index) & 0xFF;
             return descendInto(child, transition);
         }

         private int getChainTransition(int node)
         {
             // No backtracking needed.
             UnsafeBuffer chunk = getChunk(node);
             int inChunkNode = inChunkPointer(node);
             int transition = chunk.getByte(inChunkNode) & 0xFF;
             int next = node + 1;
             if (offset(next) <= CHAIN_MAX_OFFSET)
                 return descendIntoChain(next, transition);
             else
                 return descendInto(chunk.getInt(inChunkNode + 1), transition);
         }

         private int descendInto(int child, int transition)
         {
             ++depth;
             incomingTransition = transition;
             content = getNodeContent(child);
             currentNode = followContentTransition(child);
             return depth;
         }

         private int descendIntoChain(int child, int transition)
         {
             ++depth;
             incomingTransition = transition;
             content = null;
             currentNode = child;
             return depth;
         }
     }

     private boolean isChainNode(int node)
     {
         return !isNullOrLeaf(node) && offset(node) <= CHAIN_MAX_OFFSET;
     }

     public MemtableCursor cursor()
     {
         return new MemtableCursor();
     }

     /*
      Direct read methods
      */

     /**
      * Get the content mapped by the specified key.
      * Fast implementation using integer node addresses.
      */
     public T get(ByteComparable path)
     {
         int n = root;
         ByteSource source = path.asComparableBytes(BYTE_COMPARABLE_VERSION);
         while (!isNull(n))
         {
             int c = source.next();
             if (c == ByteSource.END_OF_STREAM)
                 return getNodeContent(n);

             n = advance(n, c, source);
         }

         return null;
     }

     public boolean isEmpty()
     {
         return isNull(root);
     }

     /**
      * Override of dump to provide more detailed printout that includes the type of each node in the trie.
      * We do this via a wrapping cursor that returns a content string for the type of node for every node we return.
      */
     @Override
     public String dump(Function<T, String> contentToString)
     {
         MemtableCursor source = cursor();
         class TypedNodesCursor implements Cursor<String>
         {
             @Override
             public int advance()
             {
                 return source.advance();
             }


             @Override
             public int advanceMultiple(TransitionsReceiver receiver)
             {
                 return source.advanceMultiple(receiver);
             }

             @Override
             public int skipChildren()
             {
                 return source.skipChildren();
             }

             @Override
             public int depth()
             {
                 return source.depth();
             }

             @Override
             public int incomingTransition()
             {
                 return source.incomingTransition();
             }

             @Override
             public String content()
             {
                 String type = null;
                 int node = source.currentNode;
                 if (!isNullOrLeaf(node))
                 {
                     switch (offset(node))
                     {
                         case SPARSE_OFFSET:
                             type = "[SPARSE]";
                             break;
                         case SPLIT_OFFSET:
                             type = "[SPLIT]";
                             break;
                         case PREFIX_OFFSET:
                             throw new AssertionError("Unexpected prefix as cursor currentNode.");
                         default:
                             type = "[CHAIN]";
                             break;
                     }
                 }
                 T content = source.content();
                 if (content != null)
                 {
                     if (type != null)
                         return contentToString.apply(content) + " -> " + type;
                     else
                         return contentToString.apply(content);
                 }
                 else
                     return type;
             }
         }
         return process(new TrieDumper<>(Function.identity()), new TypedNodesCursor());
     }
 }