| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // This file implements a concurrent in-memory B-tree similar to the one |
// described in the Masstree paper:
| // "Cache Craftiness for Fast Multicore Key-Value Storage" |
| // Mao, Kohler, and Morris |
| // Eurosys 2012 |
| // |
| // This implementation is only the B-tree component, and not the "trie of trees" |
// which makes up their full data structure. In addition to this, there are
| // some other key differences: |
| // - We do not support removal of elements from the tree -- in the Kudu memrowset |
| // use case, we use a deletion bit to indicate a removed record, and end up |
| // actually removing the storage at compaction time. |
| // - We do not support updating elements in the tree. Because we use MVCC, we |
| // only append new entries. A limited form of update is allowed in that data |
| // may be modified so long as the size is not changed. In that case, it is |
| // up to the user to provide concurrency control of the update (eg by using |
| // atomic operations or external locking) |
| // - The leaf nodes are linked together with a "next" pointer. This makes |
| // scanning simpler (the Masstree implementation avoids this because it |
| // complicates the removal operation) |
| // |
| // NOTE: this code disables TSAN for the most part. This is because it uses |
| // some "clever" concurrency mechanisms which are difficult to model in TSAN. |
| // We instead ensure correctness by heavy stress-testing. |
| #pragma once |
| |
| #include <algorithm> |
| #include <boost/smart_ptr/detail/yield_k.hpp> |
| #include <boost/utility/binary.hpp> |
| #include <memory> |
| #include <string> |
| |
| #include "kudu/gutil/macros.h" |
| #include "kudu/gutil/mathlimits.h" |
| #include "kudu/gutil/port.h" |
| #include "kudu/gutil/stringprintf.h" |
| #include "kudu/util/debug/sanitizer_scopes.h" |
| #include "kudu/util/inline_slice.h" |
| #include "kudu/util/logging.h" |
| #include "kudu/util/memory/arena.h" |
| #include "kudu/util/status.h" |
| |
| //#define TRAVERSE_PREFETCH |
| #define SCAN_PREFETCH |
| |
| |
| // Define the following to get an ugly printout on each node split |
| // to see how much of the node was actually being used. |
| // #define DEBUG_DUMP_SPLIT_STATS |
| |
| namespace kudu { namespace tablet { |
| namespace btree { |
| |
| // All CBTree implementation classes are templatized on a traits |
| // structure which customizes the implementation at compile-time. |
| // |
| // This default implementation should be reasonable for most usage. |
| struct BTreeTraits { |
| enum TraitConstants { |
| // Number of bytes used per internal node. |
| kInternalNodeSize = 4 * CACHELINE_SIZE, |
| |
| // Number of bytes used by a leaf node. |
| kLeafNodeSize = 4 * CACHELINE_SIZE, |
| |
| // Tests can set this trait to a non-zero value, which inserts |
| // some pause-loops in key parts of the code to try to simulate |
| // races. |
| kDebugRaciness = 0 |
| }; |
| typedef ThreadSafeArena ArenaType; |
| }; |
| |
| template<class T> |
| inline void PrefetchMemory(const T *addr) { |
| int size = std::min<int>(sizeof(T), 4 * CACHELINE_SIZE); |
| |
| for (int i = 0; i < size; i += CACHELINE_SIZE) { |
| prefetch(reinterpret_cast<const char *>(addr) + i, PREFETCH_HINT_T0); |
| } |
| } |
| |
// Utility function that, when Traits::kDebugRaciness is non-zero
// (i.e. only in debug code), will spin for some amount of time
// related to that setting.
| // This can be used when trying to debug race conditions, but |
| // will compile away in production code. |
| template<class Traits> |
| void DebugRacyPoint() { |
| if (Traits::kDebugRaciness > 0) { |
| boost::detail::yield(Traits::kDebugRaciness); |
| } |
| } |
| |
| template<class Traits> class NodeBase; |
| template<class Traits> class InternalNode; |
| template<class Traits> class LeafNode; |
| template<class Traits> class PreparedMutation; |
| template<class Traits> class CBTree; |
| template<class Traits> class CBTreeIterator; |
| |
| typedef base::subtle::Atomic64 AtomicVersion; |
| |
| struct VersionField { |
| public: |
| static AtomicVersion StableVersion(volatile AtomicVersion *version) { |
| for (int loop_count = 0; true; loop_count++) { |
| AtomicVersion v_acq = base::subtle::Acquire_Load(version); |
| if (PREDICT_TRUE(!IsLocked(v_acq))) { |
| return v_acq; |
| } |
      boost::detail::yield(loop_count);
| } |
| } |
| |
| static void Lock(volatile AtomicVersion *version) { |
| int loop_count = 0; |
| |
| while (true) { |
| AtomicVersion v_acq = base::subtle::Acquire_Load(version); |
| if (PREDICT_TRUE(!IsLocked(v_acq))) { |
| AtomicVersion v_locked = SetLockBit(v_acq, 1); |
| if (PREDICT_TRUE(base::subtle::Acquire_CompareAndSwap(version, v_acq, v_locked) == v_acq)) { |
| return; |
| } |
| } |
| // Either was already locked by someone else, or CAS failed. |
| boost::detail::yield(loop_count++); |
| } |
| } |
| |
| static void Unlock(volatile AtomicVersion *version) { |
| // NoBarrier should be OK here, because no one else modifies the |
| // version while we have it locked. |
| AtomicVersion v = base::subtle::NoBarrier_Load(version); |
| |
| DCHECK(v & BTREE_LOCK_MASK); |
| |
| // If splitting, increment the splitting field |
| v += ((v & BTREE_SPLITTING_MASK) >> BTREE_SPLITTING_BIT) << BTREE_VSPLIT_SHIFT; |
| // If inserting, increment the insert field |
| v += ((v & BTREE_INSERTING_MASK) >> BTREE_INSERTING_BIT) << BTREE_VINSERT_SHIFT; |
| |
| // Get rid of the lock, flags and any overflow into the unused section. |
| v = SetLockBit(v, 0); |
| v &= ~(BTREE_UNUSED_MASK | BTREE_INSERTING_MASK | BTREE_SPLITTING_MASK); |
| |
| base::subtle::Release_Store(version, v); |
| } |
| |
| static uint64_t GetVSplit(AtomicVersion v) { |
| return v & BTREE_VSPLIT_MASK; |
| } |
| static uint64_t GetVInsert(AtomicVersion v) { |
| return (v & BTREE_VINSERT_MASK) >> BTREE_VINSERT_SHIFT; |
| } |
| static void SetSplitting(volatile AtomicVersion *v) { |
| base::subtle::Release_Store(v, *v | BTREE_SPLITTING_MASK); |
| } |
| static void SetInserting(volatile AtomicVersion *v) { |
| base::subtle::Release_Store(v, *v | BTREE_INSERTING_MASK); |
| } |
| static void SetLockedInsertingNoBarrier(volatile AtomicVersion *v) { |
| *v = VersionField::BTREE_LOCK_MASK | VersionField::BTREE_INSERTING_MASK; |
| } |
| |
| // Return true if the two version fields differ in more |
| // than just the lock status. |
| static bool IsDifferent(AtomicVersion v1, AtomicVersion v2) { |
| return PREDICT_FALSE((v1 & ~BTREE_LOCK_MASK) != (v2 & ~BTREE_LOCK_MASK)); |
| } |
| |
| // Return true if a split has occurred between the two versions |
| // or is currently in progress |
| static bool HasSplit(AtomicVersion v1, AtomicVersion v2) { |
| return PREDICT_FALSE((v1 & (BTREE_VSPLIT_MASK | BTREE_SPLITTING_MASK)) != |
| (v2 & (BTREE_VSPLIT_MASK | BTREE_SPLITTING_MASK))); |
| } |
| |
| static inline bool IsLocked(AtomicVersion v) { |
| return v & BTREE_LOCK_MASK; |
| } |
| static inline bool IsSplitting(AtomicVersion v) { |
| return v & BTREE_SPLITTING_MASK; |
| } |
| static inline bool IsInserting(AtomicVersion v) { |
| return v & BTREE_INSERTING_MASK; |
| } |
| |
| static std::string Stringify(AtomicVersion v) { |
| return StringPrintf("[flags=%c%c%c vins=%" PRIu64 " vsplit=%" PRIu64 "]", |
| (v & BTREE_LOCK_MASK) ? 'L':' ', |
| (v & BTREE_SPLITTING_MASK) ? 'S':' ', |
| (v & BTREE_INSERTING_MASK) ? 'I':' ', |
| GetVInsert(v), |
| GetVSplit(v)); |
| } |
| |
| private: |
| enum { |
| BTREE_LOCK_BIT = 63, |
| BTREE_SPLITTING_BIT = 62, |
| BTREE_INSERTING_BIT = 61, |
    BTREE_VINSERT_SHIFT = 26,
| BTREE_VSPLIT_SHIFT = 0, |
| |
| #define BB(x) BOOST_BINARY(x) |
| BTREE_LOCK_MASK = |
| BB(10000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), |
| BTREE_SPLITTING_MASK = |
| BB(01000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), |
| BTREE_INSERTING_MASK = |
| BB(00100000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), |
| |
    // There is one unused bit between the single-bit fields and the
| // incremented fields. This allows us to efficiently increment the |
| // fields and avoid an extra instruction or two, since we don't need |
| // to worry about overflow. If vsplit overflows into vinsert, that's |
| // not a problem, since the vsplit change always results in a retry. |
| // If we roll over into this unused bit, we'll mask it out. |
| BTREE_UNUSED_MASK = |
| BB(00010000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 ), |
| BTREE_VINSERT_MASK = |
| BB(00001111 11111111 11111111 11111111 11111100 00000000 00000000 00000000 ), |
| BTREE_VSPLIT_MASK = |
| BB(00000000 00000000 00000000 00000000 00000011 11111111 11111111 11111111 ), |
| #undef BB |
| }; |
| |
  // The constructor is declared but never defined: this class is just
  // a collection of static utilities and must not be instantiated.
| VersionField(); |
| |
| static AtomicVersion SetLockBit(AtomicVersion v, int lock) { |
| DCHECK(lock == 0 || lock == 1); |
| v = v & ~BTREE_LOCK_MASK; |
| COMPILE_ASSERT(sizeof(AtomicVersion) == 8, must_use_64bit_version); |
| v |= (uint64_t)lock << BTREE_LOCK_BIT; |
| return v; |
| } |
| }; |
| |
| // Slice-like class for representing pointers to values in leaf nodes. |
| // This is used in preference to a normal Slice only so that it can have |
| // the same API as InlineSlice, and because it takes up less space |
| // inside of the tree leaves themselves. |
| // |
| // Stores the length of its data as the first sizeof(uintptr_t) bytes of |
| // the pointed-to data. |
| class ValueSlice { |
| private: |
| // We have to use a word-size field to store the length of the slice so |
| // that the user's data starts at a word-aligned address. |
| // Otherwise, the user could not use atomic operations on pointers inside |
| // their value (eg the mutation linked list in an MRS). |
| typedef uintptr_t size_type; |
| public: |
| Slice as_slice() const { |
| return Slice(ptr_ + sizeof(size_type), |
| *reinterpret_cast<const size_type*>(ptr_)); |
| } |
| |
| // Set this slice to a copy of 'src', allocated from alloc_arena. |
| // The copy will be word-aligned. No memory ordering is implied. |
| template<class ArenaType> |
| void set(const Slice& src, ArenaType* alloc_arena) { |
| uint8_t* in_arena = reinterpret_cast<uint8_t*>( |
| alloc_arena->AllocateBytesAligned(src.size() + sizeof(size_type), |
| sizeof(uint8_t*))); |
| // No special CAS/etc are necessary here, since anyone calling this holds the |
| // lock on the row. Concurrent readers never try to follow this pointer until |
| // they've gotten a consistent snapshot. |
| // |
| // (This is different than the keys, where concurrent tree traversers may |
| // actually try to follow the key indirection pointers from InlineSlice |
| // without copying a snapshot first). |
| DCHECK_LE(src.size(), MathLimits<size_type>::kMax) |
| << "Slice too large for btree"; |
| size_type size = src.size(); |
| memcpy(in_arena, &size, sizeof(size)); |
| memcpy(in_arena + sizeof(size), src.data(), src.size()); |
| ptr_ = const_cast<const uint8_t*>(in_arena); |
| } |
| |
| void prefetch() { |
| ::prefetch(reinterpret_cast<const char*>(ptr_), PREFETCH_HINT_T0); |
| } |
| |
| private: |
| const uint8_t* ptr_; |
| } PACKED; |
| |
| // Return the index of the first entry in the array which is |
| // >= the given value |
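// For example, given entries ["b", "d"]: searching for "a" returns 0,
// "b" returns 0 with *exact = true, "c" returns 1, and "e" returns 2
// (i.e. num_entries).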
| template<size_t N> |
| size_t FindInSliceArray(const InlineSlice<N, true> *array, ssize_t num_entries, |
| const Slice &key, bool *exact) { |
| DCHECK_GE(num_entries, 0); |
| |
| if (PREDICT_FALSE(num_entries == 0)) { |
| *exact = false; |
| return 0; |
| } |
| |
| size_t left = 0; |
| size_t right = num_entries - 1; |
| |
| while (left < right) { |
| int mid = (left + right + 1) / 2; |
| // TODO: inline slices with more than 8 bytes will store a prefix of the |
| // slice inline, which we could use to short circuit some of these comparisons. |
| int compare = array[mid].as_slice().compare(key); |
    if (compare < 0) { // array[mid] < key
      left = mid;
    } else if (compare > 0) { // array[mid] > key
      right = mid - 1;
    } else { // array[mid] == key
| *exact = true; |
| return mid; |
| } |
| } |
| |
| int compare = array[left].as_slice().compare(key); |
| *exact = compare == 0; |
  if (compare < 0) { // array[left] < key
| left++; |
| } |
| return left; |
| } |
| |
| |
| template<class ISlice, class ArenaType> |
| static void InsertInSliceArray(ISlice *array, size_t num_entries, |
| const Slice &src, size_t idx, |
| ArenaType *arena) { |
| DCHECK_LT(idx, num_entries); |
| for (size_t i = num_entries - 1; i > idx; i--) { |
| array[i] = array[i - 1]; |
| } |
| array[idx].set(src, arena); |
| } |
| |
| |
| template<class Traits> |
| class NodeBase { |
| public: |
| AtomicVersion StableVersion() { |
| return VersionField::StableVersion(&version_); |
| } |
| |
| AtomicVersion AcquireVersion() { |
| return base::subtle::Acquire_Load(&version_); |
| } |
| |
| void Lock() { |
| VersionField::Lock(&version_); |
| } |
| |
| bool IsLocked() { |
| return VersionField::IsLocked(version_); |
| } |
| |
| void Unlock() { |
| VersionField::Unlock(&version_); |
| } |
| |
| void SetSplitting() { |
| VersionField::SetSplitting(&version_); |
| } |
| |
| void SetInserting() { |
| VersionField::SetInserting(&version_); |
| } |
| |
| // Return the parent node for this node, with the lock acquired. |
| InternalNode<Traits> *GetLockedParent() { |
| while (true) { |
| InternalNode<Traits> *ret = parent_; |
| if (ret == NULL) { |
| return NULL; |
| } |
| |
| ret->Lock(); |
| |
| if (PREDICT_FALSE(parent_ != ret)) { |
        // The parent changed between our read and acquiring its lock; retry.
| ret->Unlock(); |
| continue; |
| } |
| |
| return ret; |
| } |
| } |
| |
| protected: |
| friend class CBTree<Traits>; |
| |
| NodeBase() : version_(0), parent_(NULL) |
| {} |
| |
| public: |
| volatile AtomicVersion version_; |
| |
| // parent_ field is protected not by this node's lock, but by |
| // the parent's lock. This allows reassignment of the parent_ |
| // field to occur after a split without gathering locks for all |
| // the children. |
| InternalNode<Traits> *parent_; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(NodeBase); |
| } PACKED; |
| |
| |
| |
| // Wrapper around a void pointer, which encodes the type |
| // of the pointed-to object in its most-significant-bit. |
| // The pointer may reference either an internal node or a |
| // leaf node. |
// This assumes that the most significant bit of all valid pointers is
// 0, so that bit can be used as storage. This is true on x86-64, where
// canonical pointers use only the low 48 bits.
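// For example (hypothetical addresses): an InternalNode* at 0x00007f0000001000
// is stored as-is, while a LeafNode* at the same address is stored as
// 0x80007f0000001000; masking off the top bit recovers the original pointer.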
| template<class T> |
| struct NodePtr { |
| enum NodeType { |
| INTERNAL_NODE, |
| LEAF_NODE |
| }; |
| |
| |
| NodePtr() : p_(NULL) {} |
| |
| NodePtr(InternalNode<T> *p) { // NOLINT(runtime/explicit) |
| uintptr_t p_int = reinterpret_cast<uintptr_t>(p); |
| DCHECK(!(p_int & kDiscriminatorBit)) << "Pointer must not use most significant bit"; |
| p_ = p; |
| } |
| |
| NodePtr(LeafNode<T> *p) { // NOLINT(runtime/explicit) |
| uintptr_t p_int = reinterpret_cast<uintptr_t>(p); |
| DCHECK(!(p_int & kDiscriminatorBit)) << "Pointer must not use most significant bit"; |
| p_ = reinterpret_cast<void *>(p_int | kDiscriminatorBit); |
| } |
| |
| NodeType type() { |
| DCHECK(p_ != NULL); |
| if (reinterpret_cast<uintptr_t>(p_) & kDiscriminatorBit) { |
| return LEAF_NODE; |
| } else { |
| return INTERNAL_NODE; |
| } |
| } |
| |
| bool is_null() { |
| return p_ == NULL; |
| } |
| |
| InternalNode<T> *internal_node_ptr() { |
| DCHECK_EQ(type(), INTERNAL_NODE); |
| return reinterpret_cast<InternalNode<T> *>(p_); |
| } |
| |
| LeafNode<T> *leaf_node_ptr() { |
| DCHECK_EQ(type(), LEAF_NODE); |
| return reinterpret_cast<LeafNode<T> *>( |
| reinterpret_cast<uintptr_t>(p_) & (~kDiscriminatorBit)); |
| } |
| |
| NodeBase<T> *base_ptr() { |
| DCHECK(!is_null()); |
| return reinterpret_cast<NodeBase<T> *>( |
| reinterpret_cast<uintptr_t>(p_) & (~kDiscriminatorBit)); |
| } |
| |
| void *p_; |
| |
| private: |
| enum { |
    kDiscriminatorBit = (1UL << (sizeof(uintptr_t) * 8 - 1))
| }; |
| } PACKED; |
| |
| enum InsertStatus { |
| INSERT_SUCCESS, |
| INSERT_FULL, |
| INSERT_DUPLICATE |
| }; |
| |
| //////////////////////////////////////////////////////////// |
| // Internal node |
| //////////////////////////////////////////////////////////// |
| |
| template<class Traits> |
| class PACKED InternalNode : public NodeBase<Traits> { |
| public: |
| |
| // Construct a new internal node, containing the given children. |
| // This also reassigns the parent pointer of the children. |
| // Because other accessors of the tree may follow the children's |
| // parent pointers back up to discover a new root, and the parent |
| // pointers are covered by their parent's lock, this requires that |
  // the new internal node is constructed in LOCKED state.
| InternalNode(const Slice &split_key, |
| NodePtr<Traits> lchild, |
| NodePtr<Traits> rchild, |
| typename Traits::ArenaType* arena) |
| : num_children_(0) { |
| DCHECK_EQ(lchild.type(), rchild.type()) |
| << "Only expect to create a new internal node on account of a " |
| << "split: child nodes should have same type"; |
| |
| // Just assign the version, instead of using the proper ->Lock() |
| // since we don't need a CAS here. |
| VersionField::SetLockedInsertingNoBarrier(&this->version_); |
| |
| keys_[0].set(split_key, arena); |
| DCHECK_GT(split_key.size(), 0); |
| child_pointers_[0] = lchild; |
| child_pointers_[1] = rchild; |
| ReassignParent(lchild); |
| ReassignParent(rchild); |
| |
| num_children_ = 2; |
| } |
| |
| // Insert a new entry to the internal node. |
| // |
| // This is typically called after one of its child nodes has split. |
| InsertStatus Insert(const Slice &key, NodePtr<Traits> right_child, |
| typename Traits::ArenaType* arena) { |
| DCHECK(this->IsLocked()); |
| CHECK_GT(key.size(), 0); |
| |
| bool exact; |
| size_t idx = Find(key, &exact); |
| CHECK(!exact) |
| << "Trying to insert duplicate key " << KUDU_REDACT(key.ToDebugString()) |
| << " into an internal node! Internal node keys should result " |
| << " from splits and therefore be unique."; |
| |
| if (PREDICT_FALSE(num_children_ == kFanout)) { |
| return INSERT_FULL; |
| } |
| |
| // About to modify this node - flag it so that concurrent |
| // readers will retry. |
| this->SetInserting(); |
| |
| // Insert the key and child pointer in the right spot in the list |
| int new_num_children = num_children_ + 1; |
| InsertInSliceArray(keys_, new_num_children, key, idx, arena); |
| for (int i = new_num_children - 1; i > idx + 1; i--) { |
| child_pointers_[i] = child_pointers_[i - 1]; |
| } |
| child_pointers_[idx + 1] = right_child; |
| |
| base::subtle::Release_Store(reinterpret_cast<volatile Atomic32*>( |
| &num_children_), new_num_children); |
| |
| ReassignParent(right_child); |
| |
| return INSERT_SUCCESS; |
| } |
| |
| // Return the node index responsible for the given key. |
| // For example, if the key is less than the first discriminating |
| // node, returns 0. If it is between 0 and 1, returns 1, etc. |
| size_t Find(const Slice &key, bool *exact) { |
| return FindInSliceArray(keys_, key_count(), key, exact); |
| } |
| |
| // Find the child whose subtree may contain the given key. |
| // Note that this result may be an invalid or incorrect pointer if the |
| // caller has not locked the node, in which case OCC should be |
| // used to verify it after its usage. |
| NodePtr<Traits> FindChild(const Slice &key) { |
| bool exact; |
| size_t idx = Find(key, &exact); |
| if (exact) { |
| idx++; |
| } |
| return child_pointers_[idx]; |
| } |
| |
| Slice GetKey(size_t idx) const { |
| DCHECK_LT(idx, key_count()); |
| return keys_[idx].as_slice(); |
| } |
| |
  // Truncates the node, removing entries from the right to reduce
  // to the new size.
  // Caller must hold the node's lock with the SPLITTING flag set.
| void Truncate(size_t new_num_keys) { |
| DCHECK(this->IsLocked()); |
| DCHECK(VersionField::IsSplitting(this->version_)); |
| DCHECK_GT(new_num_keys, 0); |
| |
| DCHECK_LT(new_num_keys, key_count()); |
| num_children_ = new_num_keys + 1; |
| |
| #ifndef NDEBUG |
| // This loop isn't necessary for correct operation, but nulling the pointers |
| // might help us catch bugs in debug mode. |
| for (int i = 0; i < num_children_; i++) { |
| DCHECK(!child_pointers_[i].is_null()); |
| } |
| for (int i = num_children_; i < kFanout; i++) { |
| // reset to NULL |
| child_pointers_[i] = NodePtr<Traits>(); |
| } |
| #endif |
| } |
| |
| std::string ToString() const { |
| std::string ret("["); |
| for (int i = 0; i < num_children_; i++) { |
| if (i > 0) { |
| ret.append(", "); |
| } |
| Slice k = keys_[i].as_slice(); |
| ret.append(KUDU_REDACT(k.ToDebugString())); |
| } |
| ret.append("]"); |
| return ret; |
| } |
| |
| private: |
| friend class CBTree<Traits>; |
| |
| void ReassignParent(NodePtr<Traits> child) { |
| child.base_ptr()->parent_ = this; |
| } |
| |
| int key_count() const { |
| // The node uses N keys to separate N+1 child pointers. |
| DCHECK_GT(num_children_, 0); |
| return num_children_ - 1; |
| } |
| |
| typedef InlineSlice<sizeof(void*), true> KeyInlineSlice; |
| |
| enum SpaceConstants { |
| constant_overhead = sizeof(NodeBase<Traits>) // base class |
| + sizeof(uint32_t), // num_children_ |
| keyptr_space = Traits::kInternalNodeSize - constant_overhead, |
| kFanout = keyptr_space / (sizeof(KeyInlineSlice) + sizeof(NodePtr<Traits>)) |
| }; |
| |
| // This ordering of members ensures KeyInlineSlices are properly aligned |
| // for atomic ops |
| KeyInlineSlice keys_[kFanout]; |
| NodePtr<Traits> child_pointers_[kFanout]; |
| uint32_t num_children_; |
| } PACKED; |
| |
| //////////////////////////////////////////////////////////// |
| // Leaf node |
| //////////////////////////////////////////////////////////// |
| |
| template<class Traits> |
| class LeafNode : public NodeBase<Traits> { |
| public: |
| // Construct a new leaf node. |
| // If initially_locked is true, then the new node is created |
| // with LOCKED and INSERTING set. |
| explicit LeafNode(bool initially_locked) |
| : next_(NULL), |
| num_entries_(0) { |
| if (initially_locked) { |
| // Just assign the version, instead of using the proper ->Lock() |
| // since we don't need a CAS here. |
| VersionField::SetLockedInsertingNoBarrier(&this->version_); |
| } |
| } |
| |
| int num_entries() const { return num_entries_; } |
| |
| void PrepareMutation(PreparedMutation<Traits> *ret) { |
| DCHECK(this->IsLocked()); |
| ret->leaf_ = this; |
| ret->idx_ = Find(ret->key(), &ret->exists_); |
| } |
| |
| // Insert a new entry into this leaf node. |
| InsertStatus Insert(PreparedMutation<Traits> *mut, const Slice &val) { |
| DCHECK_EQ(this, mut->leaf()); |
| DCHECK(this->IsLocked()); |
| |
| if (PREDICT_FALSE(mut->exists())) { |
| return INSERT_DUPLICATE; |
| } |
| |
| return InsertNew(mut->idx(), mut->key(), val, mut->arena()); |
| } |
| |
| // Insert an entry at the given index, which is guaranteed to be |
| // new. |
| InsertStatus InsertNew(size_t idx, const Slice &key, const Slice &val, |
| typename Traits::ArenaType* arena) { |
| if (PREDICT_FALSE(num_entries_ == kMaxEntries)) { |
      // Node is full.
| return INSERT_FULL; |
| } |
| |
| DCHECK_LT(idx, kMaxEntries); |
| |
| this->SetInserting(); |
| |
| // The following inserts should always succeed because we |
| // verified that there is space available above. |
| num_entries_++; |
| InsertInSliceArray(keys_, num_entries_, key, idx, arena); |
| DebugRacyPoint<Traits>(); |
| InsertInSliceArray(vals_, num_entries_, val, idx, arena); |
| |
| return INSERT_SUCCESS; |
| } |
| |
| // Find the index of the first key which is >= the given |
| // search key. |
| // If the comparison is equal, then sets *exact to true. |
| // If no keys in the leaf are >= the given search key, |
| // then returns the size of the leaf node. |
| // |
| // Note that, if the lock is not held, this may return |
| // bogus results, in which case OCC must be used to verify. |
| size_t Find(const Slice &key, bool *exact) const { |
| return FindInSliceArray(keys_, num_entries_, key, exact); |
| } |
| |
| // Get the slice corresponding to the nth key. |
| // |
| // If the caller does not hold the lock, then this Slice |
| // may point to arbitrary data, and the result should be only |
| // trusted when verified by checking for conflicts. |
| Slice GetKey(size_t idx) const { |
| return keys_[idx].as_slice(); |
| } |
| |
| ValueSlice GetValue(size_t idx) const { |
| return vals_[idx]; |
| } |
| |
| // Get the slice corresponding to the nth key and value. |
| // |
| // If the caller does not hold the lock, then this Slice |
| // may point to arbitrary data, and the result should be only |
| // trusted when verified by checking for conflicts. |
| // |
| // NOTE: the value slice may include an *invalid pointer*, not |
| // just invalid data, so any readers should check for conflicts |
| // before accessing the value slice. |
| // The key, on the other hand, will always be a valid pointer, but |
| // may be invalid data. |
| void Get(size_t idx, Slice *k, ValueSlice *v) const { |
| *k = GetKey(idx); |
| *v = GetValue(idx); |
| } |
| |
| // Truncates the node, removing entries from the right to reduce |
| // to the new size. |
| // Caller must hold the node's lock with the SPLITTING flag set. |
| void Truncate(size_t new_num_entries) { |
| DCHECK(this->IsLocked()); |
| DCHECK(VersionField::IsSplitting(this->version_)); |
| |
| DCHECK_LT(new_num_entries, num_entries_); |
| num_entries_ = new_num_entries; |
| } |
| |
| std::string ToString() const { |
| std::string ret; |
| for (int i = 0; i < num_entries_; i++) { |
| if (i > 0) { |
| ret.append(", "); |
| } |
| Slice k = keys_[i].as_slice(); |
| Slice v = vals_[i].as_slice(); |
| ret.append("["); |
| ret.append(KUDU_REDACT(k.ToDebugString())); |
| ret.append("="); |
| ret.append(KUDU_REDACT(v.ToDebugString())); |
| ret.append("]"); |
| } |
| return ret; |
| } |
| |
| private: |
| friend class CBTree<Traits>; |
| friend class InternalNode<Traits>; |
| friend class CBTreeIterator<Traits>; |
| |
| typedef InlineSlice<sizeof(void*), true> KeyInlineSlice; |
| |
| // It is necessary to name this enum so that DCHECKs can use its |
| // constants (the macros may attempt to specialize templates |
| // with the constants, which require a named type). |
| enum SpaceConstants { |
| constant_overhead = sizeof(NodeBase<Traits>) // base class |
| + sizeof(LeafNode<Traits>*) // next_ |
| + sizeof(uint8_t), // num_entries_ |
| kv_space = Traits::kLeafNodeSize - constant_overhead, |
| kMaxEntries = kv_space / (sizeof(KeyInlineSlice) + sizeof(ValueSlice)) |
| }; |
| |
  // This ordering of members keeps the KeyInlineSlices word-aligned so that
  // their embedded pointers may be used with atomic operations.
| LeafNode<Traits>* next_; |
| KeyInlineSlice keys_[kMaxEntries]; |
| ValueSlice vals_[kMaxEntries]; |
| uint8_t num_entries_; |
| } PACKED; |
| |
| |
| //////////////////////////////////////////////////////////// |
| // Tree API |
| //////////////////////////////////////////////////////////// |
| |
| // A "scoped" object which holds a lock on a leaf node. |
// Instances should be prepared with PreparedMutation<T>::Prepare()
| // and then used with a further Insert() call. |
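//
// A typical insert, sketched (error handling omitted; 'tree' is assumed to
// be a CBTree<> created elsewhere):
//
//   PreparedMutation<BTreeTraits> mut(Slice("my_key"));
//   mut.Prepare(&tree);  // locks the leaf responsible for the key
//   if (!mut.exists()) {
//     CHECK(mut.Insert(Slice("my_value")));  // inserts and unlocks the leaf
//   }
//   // If Insert() is not called, the destructor unlocks the leaf.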
| template<class Traits> |
| class PreparedMutation { |
| public: |
| // Construct a PreparedMutation. |
| // |
  // The data pointed to by the 'key' Slice passed in must remain
  // valid for the lifetime of the PreparedMutation object.
| explicit PreparedMutation(Slice key) |
| : key_(key), tree_(NULL), leaf_(NULL), needs_unlock_(false) {} |
| |
| ~PreparedMutation() { |
| UnPrepare(); |
| } |
| |
| void Reset(const Slice& key) { |
| UnPrepare(); |
| key_ = key; |
| } |
| |
| // Prepare a mutation against the given tree. |
| // |
| // This prepared mutation may then be used with Insert(). |
| // In between preparing and executing the insert, the leaf node remains |
| // locked, so callers should endeavour to keep the critical section short. |
| // |
| // If the returned PreparedMutation object is not used with |
| // Insert(), it will be automatically unlocked by its destructor. |
| void Prepare(CBTree<Traits> *tree) { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| CHECK(!prepared()); |
| this->tree_ = tree; |
| this->arena_ = tree->arena_.get(); |
| tree->PrepareMutation(this); |
| needs_unlock_ = true; |
| } |
| |
| bool Insert(const Slice &val) { |
| CHECK(prepared()); |
| return tree_->Insert(this, val); |
| } |
| |
| // Return a slice referencing the existing data in the row. |
| // |
| // This is mutable data, but the size may not be changed. |
| // This can be used for updating in place if the new data |
| // has the same size as the original data. |
| Slice current_mutable_value() { |
| CHECK(prepared()); |
| Slice k; |
| ValueSlice v; |
| leaf_->Get(idx_, &k, &v); |
| leaf_->SetInserting(); |
| return v.as_slice(); |
| } |
| |
| // Accessors |
| |
| bool prepared() const { |
| return tree_ != NULL; |
| } |
| |
| // Return the key that was prepared. |
| const Slice &key() const { return key_; } |
| |
| CBTree<Traits> *tree() const { |
| return tree_; |
| } |
| |
| LeafNode<Traits> *leaf() const { |
| return CHECK_NOTNULL(leaf_); |
| } |
| |
| // Return true if the key that was prepared already exists. |
| bool exists() const { |
| return exists_; |
| } |
| |
  size_t idx() const {
| return idx_; |
| } |
| |
| typename Traits::ArenaType* arena() { |
| return arena_; |
| } |
| |
| private: |
| friend class CBTree<Traits>; |
| friend class LeafNode<Traits>; |
| friend class TestCBTree; |
| |
| DISALLOW_COPY_AND_ASSIGN(PreparedMutation); |
| |
| void mark_done() { |
| // set leaf_ back to NULL without unlocking it, |
| // since the caller will unlock it. |
| needs_unlock_ = false; |
| } |
| |
| void UnPrepare() { |
| if (leaf_ != NULL && needs_unlock_) { |
| leaf_->Unlock(); |
| needs_unlock_ = false; |
| } |
| tree_ = NULL; |
| } |
| |
| Slice key_; |
| CBTree<Traits> *tree_; |
| |
| // The arena where inserted data may be copied if the data is too |
| // large to fit entirely within a tree node. |
| typename Traits::ArenaType* arena_; |
| |
| LeafNode<Traits> *leaf_; |
| |
| size_t idx_; |
| bool exists_; |
| bool needs_unlock_; |
| }; |
| |
| |
| template<class Traits = BTreeTraits> |
| class CBTree { |
| public: |
| CBTree() : CBTree(std::make_shared<typename Traits::ArenaType>(4 * 1024)) { |
| } |
| |
| explicit CBTree(std::shared_ptr<typename Traits::ArenaType> arena) |
| : arena_(std::move(arena)), root_(NewLeaf(false)), frozen_(false) {} |
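
  // For example (a sketch; the arena size is arbitrary):
  //
  //   auto arena = std::make_shared<ThreadSafeArena>(16 * 1024);
  //   CBTree<BTreeTraits> tree(arena);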
| |
| ~CBTree() { |
| RecursiveDelete(root_); |
| } |
| |
| |
| // Convenience API to insert an item. |
| // |
| // Returns true if successfully inserted, false if an item with the given |
| // key already exists. |
| // |
| // More advanced users can use the PreparedMutation class instead. |
| bool Insert(const Slice &key, const Slice &val) { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| PreparedMutation<Traits> mutation(key); |
| mutation.Prepare(this); |
| return mutation.Insert(val); |
| } |
| |
| void DebugPrint() const { |
| AtomicVersion v; |
| DebugPrint(StableRoot(&v), NULL, 0); |
| CHECK_EQ(root_.base_ptr()->AcquireVersion(), v) |
| << "Concurrent modification during DebugPrint not allowed"; |
| } |
| |
| enum GetResult { |
| GET_SUCCESS, |
| GET_NOT_FOUND, |
| GET_TOO_BIG |
| }; |
| |
| // Get a copy of the given key, storing the result in the |
| // provided buffer. |
| // Returns SUCCESS and sets *buf_len on success |
| // Returns NOT_FOUND if no such key is found |
| // Returns TOO_BIG if the key is too large to fit in the provided buffer. |
| // In this case, sets *buf_len to the required buffer size. |
| // |
| // TODO: this call probably won't be necessary in the final implementation |
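  //
  // A sketch of the intended calling pattern (the buffer size is arbitrary):
  //
  //   char buf[64];
  //   size_t len = sizeof(buf);
  //   switch (tree.GetCopy(key, buf, &len)) {
  //     case CBTree<>::GET_SUCCESS:   /* value copied into buf[0..len) */ break;
  //     case CBTree<>::GET_NOT_FOUND: /* no such key */ break;
  //     case CBTree<>::GET_TOO_BIG:   /* retry with a buffer of >= len */ break;
  //   }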
| GetResult GetCopy(const Slice &key, char *buf, size_t *buf_len) const { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| size_t in_buf_len = *buf_len; |
| |
| retry_from_root: |
| { |
| AtomicVersion version; |
| LeafNode<Traits> *leaf = CHECK_NOTNULL(TraverseToLeaf(key, &version)); |
| |
| DebugRacyPoint(); |
| |
| retry_in_leaf: |
| { |
| GetResult ret; |
| Slice key_in_node; |
| ValueSlice val_in_node; |
| bool exact; |
| size_t idx = leaf->Find(key, &exact); |
| DebugRacyPoint(); |
| |
| if (!exact) { |
| ret = GET_NOT_FOUND; |
| } else { |
| leaf->Get(idx, &key_in_node, &val_in_node); |
| ret = GET_SUCCESS; |
| } |
| |
| // Got some kind of result, but may be based on racy data. |
| // Verify it. |
| AtomicVersion new_version = leaf->StableVersion(); |
| if (VersionField::HasSplit(version, new_version)) { |
| goto retry_from_root; |
| } else if (VersionField::IsDifferent(version, new_version)) { |
| version = new_version; |
| goto retry_in_leaf; |
| } |
| |
| // If we found a matching key earlier, and the read of the node |
| // wasn't racy, we can safely work with the ValueSlice. |
| if (ret == GET_SUCCESS) { |
| Slice val = val_in_node.as_slice(); |
| *buf_len = val.size(); |
| |
| if (PREDICT_FALSE(val.size() > in_buf_len)) { |
| ret = GET_TOO_BIG; |
| } else { |
| memcpy(buf, val.data(), val.size()); |
| } |
| } |
| return ret; |
| } |
| } |
| } |
| |
| // Returns true if the given key is contained in the tree. |
| // TODO: unit test |
| bool ContainsKey(const Slice &key) const { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| bool ret; |
| |
| retry_from_root: |
| { |
| AtomicVersion version; |
| LeafNode<Traits> *leaf = CHECK_NOTNULL(TraverseToLeaf(key, &version)); |
| |
| DebugRacyPoint(); |
| |
| retry_in_leaf: |
| { |
| leaf->Find(key, &ret); |
| DebugRacyPoint(); |
| |
| // Got some kind of result, but may be based on racy data. |
| // Verify it. |
| AtomicVersion new_version = leaf->StableVersion(); |
| if (VersionField::HasSplit(version, new_version)) { |
| goto retry_from_root; |
| } else if (VersionField::IsDifferent(version, new_version)) { |
| version = new_version; |
| goto retry_in_leaf; |
| } |
| return ret; |
| } |
| } |
| } |
| |
| CBTreeIterator<Traits> *NewIterator() const { |
| return new CBTreeIterator<Traits>(this, frozen_); |
| } |
| |
| // Return the current number of elements in the tree. |
| // |
| // Note that this requires iterating through the entire tree, |
| // so it is not very efficient. |
| size_t count() const { |
| std::unique_ptr<CBTreeIterator<Traits>> iter(NewIterator()); |
| bool exact; |
| iter->SeekAtOrAfter(Slice(""), &exact); |
| size_t count = 0; |
| while (iter->IsValid()) { |
| count++; |
| iter->Next(); |
| } |
| return count; |
| } |
| |
| // Return true if this tree contains no elements |
| bool empty() const { |
| NodePtr<Traits> root = root_; |
| switch (root.type()) { |
| case NodePtr<Traits>::INTERNAL_NODE: |
| // If there's already an internal node, then we've inserted some data. |
| // Because we don't remove, this means we definitely have data. |
| return false; |
| case NodePtr<Traits>::LEAF_NODE: |
| return root.leaf_node_ptr()->num_entries() == 0; |
| default: |
| CHECK(0) << "bad type"; |
| return true; |
| } |
| } |
| |
| size_t estimate_memory_usage() const { |
| return arena_->memory_footprint(); |
| } |
| |
| // Mark the tree as frozen. |
| // Once frozen, no further mutations may occur without triggering a CHECK |
| // violation. But, new iterators created after this point can scan more |
| // efficiently. |
| void Freeze() { |
| frozen_ = true; |
| } |
| |
| private: |
| friend class PreparedMutation<Traits>; |
| friend class CBTreeIterator<Traits>; |
| |
| DISALLOW_COPY_AND_ASSIGN(CBTree); |
| |
| NodePtr<Traits> StableRoot(AtomicVersion *stable_version) const { |
| while (true) { |
| NodePtr<Traits> node = root_; |
| NodeBase<Traits> *node_base = node.base_ptr(); |
| *stable_version = node_base->StableVersion(); |
| |
| if (PREDICT_TRUE(node_base->parent_ == NULL)) { |
| // Found a good root |
| return node; |
| } else { |
| // root has been swapped out |
| root_ = node_base->parent_; |
| } |
| } |
| } |
| |
| LeafNode<Traits> *TraverseToLeaf(const Slice &key, |
| AtomicVersion *stable_version) const { |
| retry_from_root: |
| AtomicVersion version = 0; |
| NodePtr<Traits> node = StableRoot(&version); |
| NodeBase<Traits> *node_base = node.base_ptr(); |
| |
| while (node.type() != NodePtr<Traits>::LEAF_NODE) { |
| #ifdef TRAVERSE_PREFETCH |
| PrefetchMemory(node.internal_node_ptr()); |
| #endif |
| retry_in_node: |
| int num_children = node.internal_node_ptr()->num_children_; |
| NodePtr<Traits> child = node.internal_node_ptr()->FindChild(key); |
| NodeBase<Traits> *child_base = NULL; |
| |
| AtomicVersion child_version = -1; |
| |
| if (PREDICT_TRUE(!child.is_null())) { |
| child_base = child.base_ptr(); |
| child_version = child_base->StableVersion(); |
| } |
| AtomicVersion new_node_version = node_base->AcquireVersion(); |
| |
| if (VersionField::IsDifferent(version, new_node_version)) { |
| new_node_version = node_base->StableVersion(); |
| |
| if (VersionField::HasSplit(version, new_node_version)) { |
| goto retry_from_root; |
| } else { |
| version = new_node_version; |
| goto retry_in_node; |
| } |
| } |
| int new_children = node.internal_node_ptr()->num_children_; |
| DCHECK(!child.is_null()) |
| << "should have changed versions when child was NULL: " |
| << "old version: " << VersionField::Stringify(version) |
| << " new version: " << VersionField::Stringify(new_node_version) |
| << " version now: " << VersionField::Stringify(node_base->AcquireVersion()) |
| << " num_children: " << num_children << " -> " << new_children; |
| |
| node = child; |
| node_base = child_base; |
| version = child_version; |
| } |
| #ifdef TRAVERSE_PREFETCH |
| PrefetchMemory(node.leaf_node_ptr()); |
| #endif |
| *stable_version = version; |
| return node.leaf_node_ptr(); |
| } |
| |
| void DebugRacyPoint() const { |
| btree::DebugRacyPoint<Traits>(); |
| } |
| |
| // Dump the tree. |
  // Requires that there are no concurrent modifications.
| void DebugPrint(NodePtr<Traits> node, |
| InternalNode<Traits> *expected_parent, |
| int indent) const { |
| |
| std::string buf; |
| switch (node.type()) { |
| case NodePtr<Traits>::LEAF_NODE: |
| { |
| LeafNode<Traits> *leaf = node.leaf_node_ptr(); |
| SStringPrintf(&buf, "%*sLEAF %p: ", indent, "", leaf); |
| buf.append(leaf->ToString()); |
| LOG(INFO) << buf; |
| CHECK_EQ(leaf->parent_, expected_parent) << "failed for " << leaf; |
| break; |
| } |
| case NodePtr<Traits>::INTERNAL_NODE: |
| { |
| InternalNode<Traits> *inode = node.internal_node_ptr(); |
| |
| SStringPrintf(&buf, "%*sINTERNAL %p: ", indent, "", inode); |
| LOG(INFO) << buf; |
| |
| for (int i = 0; i < inode->num_children_; i++) { |
| DebugPrint(inode->child_pointers_[i], inode, indent + 4); |
| if (i < inode->key_count()) { |
| SStringPrintf(&buf, "%*sKEY ", indent + 2, ""); |
| buf.append(KUDU_REDACT(inode->GetKey(i).ToDebugString())); |
| LOG(INFO) << buf; |
| } |
| } |
| CHECK_EQ(inode->parent_, expected_parent) << "failed for " << inode; |
| break; |
| } |
| default: |
| CHECK(0) << "bad node type"; |
| } |
| } |
| |
| void RecursiveDelete(NodePtr<Traits> node) { |
| switch (node.type()) { |
| case NodePtr<Traits>::LEAF_NODE: |
| FreeLeaf(node.leaf_node_ptr()); |
| break; |
| case NodePtr<Traits>::INTERNAL_NODE: |
| { |
| InternalNode<Traits> *inode = node.internal_node_ptr(); |
| for (int i = 0; i < inode->num_children_; i++) { |
| RecursiveDelete(inode->child_pointers_[i]); |
| inode->child_pointers_[i] = NodePtr<Traits>(); |
| } |
| FreeInternalNode(inode); |
| break; |
| } |
| default: |
| CHECK(0); |
| } |
| } |
| |
| void PrepareMutation(PreparedMutation<Traits> *mutation) { |
| DCHECK_EQ(mutation->tree(), this); |
| while (true) { |
| AtomicVersion stable_version; |
| LeafNode<Traits> *lnode = TraverseToLeaf(mutation->key(), &stable_version); |
| |
| lnode->Lock(); |
| if (VersionField::HasSplit(lnode->AcquireVersion(), stable_version)) { |
| // Retry traversal due to a split |
| lnode->Unlock(); |
| continue; |
| } |
| |
| lnode->PrepareMutation(mutation); |
| return; |
| } |
| } |
| |
| // Inserts the given key/value into the prepared leaf node. |
| // If the leaf node is already full, handles splitting it and |
| // propagating splits up the tree. |
| // |
| // Precondition: |
| // 'node' is locked |
| // Postcondition: |
| // 'node' is unlocked |
| bool Insert(PreparedMutation<Traits> *mutation, |
| const Slice &val) { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| CHECK(!frozen_); |
| CHECK_NOTNULL(mutation); |
| DCHECK_EQ(mutation->tree(), this); |
| |
| LeafNode<Traits> *node = mutation->leaf(); |
| DCHECK(node->IsLocked()); |
| |
| // After this function, the prepared mutation cannot be used |
| // again. |
| mutation->mark_done(); |
| |
| switch (node->Insert(mutation, val)) { |
| case INSERT_SUCCESS: |
| node->Unlock(); |
| return true; |
| case INSERT_DUPLICATE: |
| node->Unlock(); |
| return false; |
| case INSERT_FULL: |
| return SplitLeafAndInsertUp(mutation, val); |
| // SplitLeafAndInsertUp takes care of unlocking |
| default: |
| CHECK(0) << "Unexpected result"; |
| break; |
| } |
| CHECK(0) << "should not get here"; |
| return false; |
| } |
| |
| // Splits the node 'node', returning the newly created right-sibling |
| // internal node 'new_inode'. |
| // |
| // Locking conditions: |
| // Precondition: |
| // node is locked |
| // Postcondition: |
| // node is still locked and marked SPLITTING |
| // new_inode is locked and marked INSERTING |
| InternalNode<Traits> *SplitInternalNode(InternalNode<Traits> *node, |
| faststring *separator_key) { |
| DCHECK(node->IsLocked()); |
| //VLOG(2) << "splitting internal node " << node->GetKey(0).ToString(); |
| |
| // TODO: simplified implementation doesn't deal with splitting |
| // when there are very small internal nodes. |
| CHECK_GT(node->key_count(), 2) |
| << "TODO: currently only support splitting nodes with >2 keys"; |
| |
| // TODO: can we share code better between the node types here? |
| // perhaps by making this part of NodeBase, wrapping the K,V slice pair |
| // in a struct type, etc? |
| |
| // Pick the split point. The split point is the key which |
| // will be moved up into the parent node. |
| int split_point = node->key_count() / 2; |
| Slice sep_slice = node->GetKey(split_point); |
| DCHECK_GT(sep_slice.size(), 0) << |
| "got bad split key when splitting: " << node->ToString(); |
| |
| separator_key->assign_copy(sep_slice.data(), sep_slice.size()); |
| |
| // Example split: |
| // [ 0, 1, 2 ] |
| // / | | \ . |
| // [A] [B] [C] [D] |
| // |
| // split_point = 3/2 = 1 |
| // separator_key = 1 |
| // |
| // =====> |
| // |
| // [ 1 ] |
| // / | |
| // [ 0 ] [ 2 ] |
| // / | | \ . |
| // [A] [B] [C] [D] |
| // |
| |
| NodePtr<Traits> separator_ptr; |
| |
| InternalNode<Traits> *new_inode = NewInternalNode( |
| node->GetKey(split_point + 1), |
| node->child_pointers_[split_point + 1], |
| node->child_pointers_[split_point + 2]); |
| |
| // The new inode is constructed in locked and INSERTING state. |
| |
| // Copy entries to the new right-hand node. |
| for (int i = split_point + 2; i < node->key_count(); i++) { |
| Slice k = node->GetKey(i); |
| DCHECK_GT(k.size(), 0); |
| NodePtr<Traits> child = node->child_pointers_[i + 1]; |
| DCHECK(!child.is_null()); |
| |
| // TODO: this could be done more efficiently since we know that |
| // these inserts are coming in sorted order. |
| CHECK_EQ(INSERT_SUCCESS, new_inode->Insert(k, child, arena_.get())); |
| } |
| |
| // Up to this point, we haven't modified the left node, so concurrent |
| // reads were consistent. But, now we're about to actually mutate, |
| // so set the flag. |
| node->SetSplitting(); |
| |
| // Truncate the left node to remove the keys which have been |
| // moved to the right node |
| node->Truncate(split_point); |
| return new_inode; |
| } |
| |
| // Split the given leaf node 'node', creating a new node |
| // with the higher half of the elements. |
| // |
| // N.B: the new node is initially locked, but doesn't have the |
| // SPLITTING flag. This function sets the SPLITTING flag before |
| // modifying it. |
| void SplitLeafNode(LeafNode<Traits> *node, |
| LeafNode<Traits> **new_node) { |
| DCHECK(node->IsLocked()); |
| |
| #ifdef DEBUG_DUMP_SPLIT_STATS |
| do { |
| size_t key_size = 0, val_size = 0; |
| for (size_t i = 0; i < node->num_entries(); i++) { |
| Slice k, v; |
| node->Get(i, &k, &v); |
| key_size += k.size(); |
| val_size += v.size(); |
| } |
| LOG(INFO) << "split leaf. entries=" << node->num_entries() |
| << " keysize=" << key_size |
| << " valsize=" << val_size; |
| } while (0); |
| #endif |
| |
| LeafNode<Traits> *new_leaf = NewLeaf(true); |
| new_leaf->next_ = node->next_; |
| |
| // Copy half the keys from node into the new leaf |
| int copy_start = node->num_entries() / 2; |
| CHECK_GT(copy_start, 0) << |
| "Trying to split a node with 0 or 1 entries"; |
| |
| std::copy(node->keys_ + copy_start, node->keys_ + node->num_entries(), |
| new_leaf->keys_); |
| std::copy(node->vals_ + copy_start, node->vals_ + node->num_entries(), |
| new_leaf->vals_); |
| new_leaf->num_entries_ = node->num_entries() - copy_start; |
| |
| // Truncate the left node to remove the keys which have been |
| // moved to the right node. |
| node->SetSplitting(); |
| node->next_ = new_leaf; |
| node->Truncate(copy_start); |
| *new_node = new_leaf; |
| } |
| |
| |
| // Splits a leaf node which is full, adding the new sibling |
| // node to the tree. |
| // This recurses upward splitting internal nodes as necessary. |
| // The node should be locked on entrance to the function |
| // and will be unlocked upon exit. |
| bool SplitLeafAndInsertUp(PreparedMutation<Traits> *mutation, |
| const Slice &val) { |
| LeafNode<Traits> *node = mutation->leaf(); |
| Slice key = mutation->key_; |
| |
| // Leaf node should already be locked at this point |
| DCHECK(node->IsLocked()); |
| |
| //DebugPrint(); |
| |
| LeafNode<Traits> *new_leaf; |
| SplitLeafNode(node, &new_leaf); |
| |
| // The new leaf node is returned still locked. |
| DCHECK(new_leaf->IsLocked()); |
| |
| // Insert the key that we were originally trying to insert in the |
| // correct side post-split. |
| Slice split_key = new_leaf->GetKey(0); |
| LeafNode<Traits> *dst_leaf = (key.compare(split_key) < 0) ? node : new_leaf; |
| // Re-prepare the mutation after the split. |
| dst_leaf->PrepareMutation(mutation); |
| |
| CHECK_EQ(INSERT_SUCCESS, dst_leaf->Insert(mutation, val)) |
| << "node split at " << KUDU_REDACT(split_key.ToDebugString()) |
| << " did not result in enough space for key " << KUDU_REDACT(key.ToDebugString()) |
| << " in left node"; |
| |
| // Insert the new node into the parents. |
| PropagateSplitUpward(node, new_leaf, split_key); |
| |
    // NB: No need to unlock the nodes here, since that is done by the upward
    // propagation path (the 'ascend' label in Figure 5 of the Masstree paper).
| |
| return true; |
| } |
| |
| // Assign the parent pointer of 'right', and insert it into the tree |
| // by propagating splits upward. |
| // Locking: |
| // Precondition: |
| // left and right are both locked |
| // left is marked SPLITTING |
| // Postcondition: |
| // parent is non-null |
| // parent is marked INSERTING |
| // left and right are unlocked |
| void PropagateSplitUpward(NodePtr<Traits> left_ptr, NodePtr<Traits> right_ptr, |
| const Slice &split_key) { |
| NodeBase<Traits> *left = left_ptr.base_ptr(); |
| NodeBase<Traits> *right = right_ptr.base_ptr(); |
| |
| DCHECK(left->IsLocked()); |
| DCHECK(right->IsLocked()); |
| |
| InternalNode<Traits> *parent = left->GetLockedParent(); |
| if (parent == NULL) { |
| // Node is the root - make new parent node |
| parent = NewInternalNode(split_key, left_ptr, right_ptr); |
| // Constructor also reassigns parents. |
| // root_ will be updated lazily by next traverser |
| left->Unlock(); |
| right->Unlock(); |
| parent->Unlock(); |
| return; |
| } |
| |
| // Parent exists. Try to insert |
| switch (parent->Insert(split_key, right_ptr, arena_.get())) { |
| case INSERT_SUCCESS: |
| { |
| VLOG(3) << "Inserted new entry into internal node " |
| << parent << " for " << KUDU_REDACT(split_key.ToDebugString()); |
| left->Unlock(); |
| right->Unlock(); |
| parent->Unlock(); |
| return; |
| } |
| case INSERT_FULL: |
| { |
| // Split the node in two |
| faststring sep_key(0); |
| InternalNode<Traits> *new_inode = SplitInternalNode(parent, &sep_key); |
| |
| DCHECK(new_inode->IsLocked()); |
| DCHECK(parent->IsLocked()) << "original should still be locked"; |
| |
| // Insert the new entry into the appropriate half. |
| Slice inode_split(sep_key); |
| InternalNode<Traits> *dst_inode = |
| (split_key.compare(inode_split) < 0) ? parent : new_inode; |
| |
| VLOG(2) << "Split internal node " << parent << " for insert of " |
| << KUDU_REDACT(split_key.ToDebugString()) << "[" << right << "]" |
| << " (split at " << KUDU_REDACT(inode_split.ToDebugString()) << ")"; |
| |
| CHECK_EQ(INSERT_SUCCESS, dst_inode->Insert(split_key, right_ptr, arena_.get())); |
| |
| left->Unlock(); |
| right->Unlock(); |
| PropagateSplitUpward(parent, new_inode, inode_split); |
| break; |
| } |
| default: |
| CHECK(0); |
| } |
| } |
| |
| LeafNode<Traits> *NewLeaf(bool locked) { |
| void *mem = CHECK_NOTNULL(arena_->AllocateBytesAligned(sizeof(LeafNode<Traits>), |
| sizeof(AtomicVersion))); |
| return new (mem) LeafNode<Traits>(locked); |
| } |
| |
| InternalNode<Traits> *NewInternalNode(const Slice &split_key, |
| NodePtr<Traits> lchild, |
| NodePtr<Traits> rchild) { |
| void *mem = CHECK_NOTNULL(arena_->AllocateBytesAligned(sizeof(InternalNode<Traits>), |
| sizeof(AtomicVersion))); |
| return new (mem) InternalNode<Traits>(split_key, lchild, rchild, arena_.get()); |
| } |
| |
| void FreeLeaf(LeafNode<Traits> *leaf) { |
| leaf->~LeafNode(); |
| // No need to actually free, since it came from the arena |
| } |
| |
| void FreeInternalNode(InternalNode<Traits> *node) { |
| node->~InternalNode(); |
| // No need to actually free, since it came from the arena |
| } |
| |
| std::shared_ptr<typename Traits::ArenaType> arena_; |
| |
| // marked 'mutable' because readers will lazy-update the root |
| // when they encounter a stale root pointer. |
| mutable NodePtr<Traits> root_; |
| |
| // If true, the tree is no longer mutable. Once a tree becomes |
| // frozen, it may not be un-frozen. If an iterator is created on |
| // a frozen tree, it will be more efficient. |
| bool frozen_; |
| }; |
| |
| template<class Traits> |
| class CBTreeIterator { |
| public: |
| bool SeekToStart() { |
| bool exact; |
| return SeekAtOrAfter(Slice(""), &exact); |
| } |
| |
| bool SeekAtOrAfter(const Slice &key, bool *exact) { |
| SeekToLeaf(key); |
| SeekInLeaf(key, exact, AT_OR_AFTER); |
| return IsValid(); |
| } |
| |
| bool SeekAtOrBefore(const Slice &key, bool *exact) { |
| SeekToLeaf(key); |
| SeekInLeaf(key, exact, AT_OR_BEFORE); |
| return IsValid(); |
| } |
| |
| bool IsValid() const { |
| return seeked_; |
| } |
| |
| bool Next() { |
| DCHECK(seeked_); |
| idx_in_leaf_++; |
| if (idx_in_leaf_ < leaf_to_scan_->num_entries()) { |
| return true; |
| } else { |
| return SeekNextLeaf(); |
| } |
| } |
| |
| void GetCurrentEntry(Slice *key, Slice *val) const { |
| DCHECK(seeked_); |
| ValueSlice val_slice; |
| leaf_to_scan_->Get(idx_in_leaf_, key, &val_slice); |
| *val = val_slice.as_slice(); |
| } |
| |
| Slice GetCurrentKey() const { |
| DCHECK(seeked_); |
| return leaf_to_scan_->GetKey(idx_in_leaf_); |
| } |
| |
| |
| Slice GetCurrentValue() const { |
| DCHECK(seeked_); |
| ValueSlice val_slice = leaf_to_scan_->GetValue(idx_in_leaf_); |
| return val_slice.as_slice(); |
| } |
| |
| |
| //////////////////////////////////////////////////////////// |
| // Advanced functions which expose some of the internal state |
| // of the iterator, allowing for limited "rewind" capability |
| // within a given leaf. |
| // |
| // Single leaf nodes are the unit of "snapshotting" of this iterator. |
| // Hence, within a leaf node, the caller may rewind arbitrarily, but once |
| // moving to the next leaf node, there is no way to go back to the prior |
| // leaf node without losing consistency. |
| //////////////////////////////////////////////////////////// |
| |
| // Return the number of entries, including the current one, remaining |
| // in the leaf. |
| // For example, if the leaf has three entries [A, B, C], and GetCurrentEntry |
| // would return 'A', then this will return 3. |
| size_t remaining_in_leaf() const { |
| DCHECK(seeked_); |
| return leaf_to_scan_->num_entries() - idx_in_leaf_; |
| } |
| |
| // Return the index of the iterator inside the current leaf node. |
| size_t index_in_leaf() const { |
| return idx_in_leaf_; |
| } |
| |
| // Rewind the iterator to the given index in the current leaf node, |
| // which was probably saved off from a previous call to |
| // remaining_in_leaf(). |
| // |
| // If Next() was called more times than remaining_in_leaf(), then |
| // this call will not be successful. |
| void RewindToIndexInLeaf(size_t new_index_in_leaf) { |
| DCHECK(seeked_); |
| DCHECK_LT(new_index_in_leaf, leaf_to_scan_->num_entries()); |
| idx_in_leaf_ = new_index_in_leaf; |
| } |
| |
| // Get the key at a specific leaf node |
| Slice GetKeyInLeaf(size_t idx) const { |
| DCHECK(seeked_); |
| return leaf_to_scan_->GetKey(idx); |
| } |
| |
| // Get the given indexed entry in the current leaf node. |
| void GetEntryInLeaf(size_t idx, Slice *key, Slice *val) { |
| DCHECK(seeked_); |
| DCHECK_LT(idx, leaf_to_scan_->num_entries()); |
| leaf_to_scan_->Get(idx, key, val); |
| } |
| |
| private: |
| friend class CBTree<Traits>; |
| |
| // Control the mode in SeekInLeaf. |
| // AT_OR_BEFORE will seek the first idx with key <= given key. |
| // AT_OR_AFTER will seek the first idx with key >= given key. |
| enum SeekMode { |
| AT_OR_BEFORE, |
| AT_OR_AFTER |
| }; |
| |
| CBTreeIterator(const CBTree<Traits> *tree, |
| bool tree_frozen) : |
| tree_(tree), |
| tree_frozen_(tree_frozen), |
| seeked_(false), |
| idx_in_leaf_(-1), |
| leaf_copy_(false), |
| leaf_to_scan_(&leaf_copy_) |
| {} |
| |
| bool SeekInLeaf(const Slice &key, bool *exact, SeekMode mode) { |
| DCHECK(seeked_); |
| int idx = leaf_to_scan_->Find(key, exact); |
| if (mode == AT_OR_BEFORE) { |
      // If the leaf does not contain a key equal to the given key, the
      // result should be idx - 1; if it does contain the key
      // (*exact == true), the result should be idx.
      // There are two corner cases:
      // 1. If every entry in the leaf is > key, Find() returns idx = 0.
      // 2. If every entry in the leaf is < key, Find() returns
      //    idx = num_entries.
      //
      // The correctness of this mode depends on the fact that we don't
      // optimize the internal nodes with "smallest separator" keys. If that
      // optimization is ever made, this function will need refactoring.
| if (*exact) { |
| idx_in_leaf_ = idx; |
| return true; |
| } |
| if (leaf_to_scan_->num_entries() == 0 || idx == 0) { |
| seeked_ = false; |
| return false; |
| } |
| idx_in_leaf_ = idx - 1; |
| } else { |
| DCHECK_EQ(mode, AT_OR_AFTER); |
| idx_in_leaf_ = idx; |
| if (idx_in_leaf_ == leaf_to_scan_->num_entries()) { |
| // not found in leaf, seek to start of next leaf if it exists. |
| return SeekNextLeaf(); |
| } |
| } |
| return true; |
| } |
| |
| void SeekToLeaf(const Slice &key) { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| retry_from_root: |
| { |
| AtomicVersion version; |
| LeafNode<Traits> *leaf = tree_->TraverseToLeaf(key, &version); |
| #ifdef SCAN_PREFETCH |
| PrefetchMemory(leaf->next_); |
| #endif |
| |
| // If the tree is frozen, we don't need to follow optimistic concurrency. |
| if (tree_frozen_) { |
| leaf_to_scan_ = leaf; |
| seeked_ = true; |
| return; |
| } |
| |
| retry_in_leaf: |
| { |
| memcpy(static_cast<void*>(&leaf_copy_), leaf, sizeof(leaf_copy_)); |
| |
| AtomicVersion new_version = leaf->StableVersion(); |
| if (VersionField::HasSplit(version, new_version)) { |
| goto retry_from_root; |
| } else if (VersionField::IsDifferent(version, new_version)) { |
| version = new_version; |
| goto retry_in_leaf; |
| } |
| // Got a consistent snapshot copy of the leaf node into |
| // leaf_copy_ |
| leaf_to_scan_ = &leaf_copy_; |
| } |
| } |
| seeked_ = true; |
| } |
| |
| bool SeekNextLeaf() { |
| debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan; |
| DCHECK(seeked_); |
| LeafNode<Traits> *next = leaf_to_scan_->next_; |
| if (PREDICT_FALSE(next == NULL)) { |
| seeked_ = false; |
| return false; |
| } |
| #ifdef SCAN_PREFETCH |
| PrefetchMemory(next->next_); |
| for (int i = 0; i < next->num_entries(); i++) { |
| next->vals_[i].prefetch(); |
| } |
| #endif |
| |
| // If the tree is frozen, we don't need to play optimistic concurrency |
| // games or make a defensive copy. |
| if (tree_frozen_) { |
| leaf_to_scan_ = next; |
| idx_in_leaf_ = 0; |
| return true; |
| } |
| |
| while (true) { |
| AtomicVersion version = next->StableVersion(); |
| memcpy(static_cast<void*>(&leaf_copy_), next, sizeof(leaf_copy_)); |
| AtomicVersion new_version = next->StableVersion(); |
| if (VersionField::IsDifferent(new_version, version)) { |
| version = new_version; |
| } else { |
| idx_in_leaf_ = 0; |
| leaf_to_scan_ = &leaf_copy_; |
| return true; |
| } |
| } |
| } |
| |
| const CBTree<Traits> *tree_; |
| |
| // If true, the tree we are scanning is completely frozen and we don't |
| // need to perform optimistic concurrency control or copies for safety. |
| bool tree_frozen_; |
| |
| bool seeked_; |
| size_t idx_in_leaf_; |
| |
| |
| LeafNode<Traits> leaf_copy_; |
| LeafNode<Traits> *leaf_to_scan_; |
| }; |
| |
| } // namespace btree |
| } // namespace tablet |
| } // namespace kudu |