// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// bthread - An M:N threading library to make applications more concurrent.
// Date: Sun Aug 3 12:46:15 CST 2014
#include <pthread.h>
#include <execinfo.h>
#include <dlfcn.h> // dlsym
#include <fcntl.h> // O_RDONLY
#include "butil/atomicops.h"
#include "bvar/bvar.h"
#include "bvar/collector.h"
#include "butil/macros.h" // BAIDU_CASSERT
#include "butil/containers/flat_map.h"
#include "butil/iobuf.h"
#include "butil/fd_guard.h"
#include "butil/files/file.h"
#include "butil/files/file_path.h"
#include "butil/file_util.h"
#include "butil/unique_ptr.h"
#include "butil/third_party/murmurhash3/murmurhash3.h"
#include "butil/logging.h"
#include "butil/object_pool.h"
#include "bthread/butex.h" // butex_*
#include "bthread/processor.h" // cpu_relax, barrier
#include "bthread/mutex.h" // bthread_mutex_t
#include "bthread/sys_futex.h"
#include "bthread/log.h"
extern "C" {
extern void* __attribute__((weak)) _dl_sym(void* handle, const char* symbol, void* caller);
}
extern int __attribute__((weak)) GetStackTrace(void** result, int max_depth, int skip_count);
namespace bthread {
// Warm up backtrace before main().
void* dummy_buf[4];
const int ALLOW_UNUSED dummy_bt = GetStackTrace
? GetStackTrace(dummy_buf, arraysize(dummy_buf), 0)
: backtrace(dummy_buf, arraysize(dummy_buf));
// For controlling contentions collected per second.
static bvar::CollectorSpeedLimit g_cp_sl = BVAR_COLLECTOR_SPEED_LIMIT_INITIALIZER;
const size_t MAX_CACHED_CONTENTIONS = 512;
// Skip frames which are always the same: the unlock function and submit_contention()
const int SKIPPED_STACK_FRAMES = 2;
struct SampledContention : public bvar::Collected {
// time taken by lock and unlock, normalized according to sampling_range
int64_t duration_ns;
// number of samples, normalized according to sampling_range
double count;
void* stack[26]; // backtrace.
int nframes; // #elements in stack
// Implement bvar::Collected
void dump_and_destroy(size_t round) override;
void destroy() override;
bvar::CollectorSpeedLimit* speed_limit() override { return &g_cp_sl; }
size_t hash_code() const {
if (nframes == 0) {
return 0;
}
if (_hash_code == 0) {
_hash_code = 1;
uint32_t seed = nframes;
butil::MurmurHash3_x86_32(stack, sizeof(void*) * nframes, seed, &_hash_code);
}
return _hash_code;
}
private:
friend butil::ObjectPool<SampledContention>;
SampledContention()
: duration_ns(0), count(0), stack{NULL}, nframes(0), _hash_code(0) {}
~SampledContention() override = default;
mutable uint32_t _hash_code; // For combining samples with hashmap.
};
BAIDU_CASSERT(sizeof(SampledContention) == 256, be_friendly_to_allocator);
// Functor to compare contentions.
struct ContentionEqual {
bool operator()(const SampledContention* c1,
const SampledContention* c2) const {
return c1->hash_code() == c2->hash_code() &&
c1->nframes == c2->nframes &&
memcmp(c1->stack, c2->stack, sizeof(void*) * c1->nframes) == 0;
}
};
// Functor to hash contentions.
struct ContentionHash {
size_t operator()(const SampledContention* c) const {
return c->hash_code();
}
};
// The global context for contention profiler.
class ContentionProfiler {
public:
typedef butil::FlatMap<SampledContention*, SampledContention*,
ContentionHash, ContentionEqual> ContentionMap;
explicit ContentionProfiler(const char* name);
~ContentionProfiler();
void dump_and_destroy(SampledContention* c);
// Write buffered data into resulting file. If `ending' is true, append
// content of /proc/self/maps and retry writing until buffer is empty.
void flush_to_disk(bool ending);
void init_if_needed();
private:
bool _init; // false before first dump_and_destroy is called
bool _first_write; // true if buffer was not written to file yet.
std::string _filename; // the file storing profiling result.
butil::IOBuf _disk_buf; // temp buf before saving the file.
ContentionMap _dedup_map; // combining same samples to make result smaller.
};
ContentionProfiler::ContentionProfiler(const char* name)
: _init(false)
, _first_write(true)
, _filename(name) {
}
ContentionProfiler::~ContentionProfiler() {
if (!_init) {
// Don't write file if dump_and_destroy was never called. We may create
// such instances in ContentionProfilerStart.
return;
}
flush_to_disk(true);
}
void ContentionProfiler::init_if_needed() {
if (!_init) {
// Output is already in nanoseconds, so always set cycles/second to 1000000000.
_disk_buf.append("--- contention\ncycles/second=1000000000\n");
CHECK_EQ(0, _dedup_map.init(1024, 60));
_init = true;
}
}
void ContentionProfiler::dump_and_destroy(SampledContention* c) {
init_if_needed();
// Categorize the contention.
SampledContention** p_c2 = _dedup_map.seek(c);
if (p_c2) {
// Most contentions are caused by several hotspots, so this should be
// the common branch.
SampledContention* c2 = *p_c2;
c2->duration_ns += c->duration_ns;
c2->count += c->count;
c->destroy();
} else {
_dedup_map.insert(c, c);
}
if (_dedup_map.size() > MAX_CACHED_CONTENTIONS) {
flush_to_disk(false);
}
}
void ContentionProfiler::flush_to_disk(bool ending) {
BT_VLOG << "flush_to_disk(ending=" << ending << ")";
// Serialize contentions in _dedup_map into _disk_buf.
if (!_dedup_map.empty()) {
BT_VLOG << "dedup_map=" << _dedup_map.size();
butil::IOBufBuilder os;
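// Each deduplicated sample is written as one text line:
// "<duration_ns> <count> @ <frame> <frame> ...", skipping the first
// SKIPPED_STACK_FRAMES frames of the stack.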
for (ContentionMap::const_iterator
it = _dedup_map.begin(); it != _dedup_map.end(); ++it) {
SampledContention* c = it->second;
os << c->duration_ns << ' ' << (size_t)ceil(c->count) << " @";
for (int i = SKIPPED_STACK_FRAMES; i < c->nframes; ++i) {
os << ' ' << (void*)c->stack[i];
}
os << '\n';
c->destroy();
}
_dedup_map.clear();
_disk_buf.append(os.buf());
}
// Append /proc/self/maps to the end of the contention file, which is
// required by pprof.pl; otherwise functions in system libraries cannot be
// resolved to symbols.
if (ending) {
BT_VLOG << "Append /proc/self/maps";
// Failures are not critical, don't return directly.
butil::IOPortal mem_maps;
const butil::fd_guard fd(open("/proc/self/maps", O_RDONLY));
if (fd >= 0) {
while (true) {
ssize_t nr = mem_maps.append_from_file_descriptor(fd, 8192);
if (nr < 0) {
if (errno == EINTR) {
continue;
}
PLOG(ERROR) << "Fail to read /proc/self/maps";
break;
}
if (nr == 0) {
_disk_buf.append(mem_maps);
break;
}
}
} else {
PLOG(ERROR) << "Fail to open /proc/self/maps";
}
}
// Write _disk_buf into _filename
butil::File::Error error;
butil::FilePath path(_filename);
butil::FilePath dir = path.DirName();
if (!butil::CreateDirectoryAndGetError(dir, &error)) {
LOG(ERROR) << "Fail to create directory=`" << dir.value()
<< "', " << error;
return;
}
// Truncate on first write, append on later writes.
int flag = O_APPEND;
if (_first_write) {
_first_write = false;
flag = O_TRUNC;
}
butil::fd_guard fd(open(_filename.c_str(), O_WRONLY|O_CREAT|flag, 0666));
if (fd < 0) {
PLOG(ERROR) << "Fail to open " << _filename;
return;
}
// Write once normally; when ending, keep writing until the buffer is empty.
do {
ssize_t nw = _disk_buf.cut_into_file_descriptor(fd);
if (nw < 0) {
if (errno == EINTR) {
continue;
}
PLOG(ERROR) << "Fail to write into " << _filename;
return;
}
BT_VLOG << "Write " << nw << " bytes into " << _filename;
} while (!_disk_buf.empty() && ending);
}
// If the contention profiler is on, this variable is set to a valid
// instance, otherwise NULL.
BAIDU_CACHELINE_ALIGNMENT static ContentionProfiler* g_cp = NULL;
// This version is needed so that non-empty entries left by previous
// contention profilers can be detected and overwritten.
static uint64_t g_cp_version = 0;
// Protecting access to g_cp.
static pthread_mutex_t g_cp_mutex = PTHREAD_MUTEX_INITIALIZER;
// The map storing information for profiling pthread_mutex. Unlike
// bthread_mutex, we can't save stuff into pthread_mutex, nor can we save
// the info in TLS reliably, since a mutex can be unlocked in a different
// thread from the one that locked it (although rarely).
// This map must be very fast, since it's accessed inside the lock.
// Layout of the map:
// * Align each entry by cacheline so that different threads do not collide.
// * Hash the mutex into the map by its address. If the entry is occupied,
// cancel sampling.
// The canceling rate should be small provided that programs are unlikely to
// lock a lot of mutexes simultaneously.
const size_t MUTEX_MAP_SIZE = 1024;
BAIDU_CASSERT((MUTEX_MAP_SIZE & (MUTEX_MAP_SIZE - 1)) == 0, must_be_power_of_2);
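// A power-of-2 size lets the lookup code index the map with
// hash_mutex_ptr(m) & (MUTEX_MAP_SIZE - 1) instead of a modulo.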
struct BAIDU_CACHELINE_ALIGNMENT MutexMapEntry {
butil::static_atomic<uint64_t> versioned_mutex;
bthread_contention_site_t csite;
};
static MutexMapEntry g_mutex_map[MUTEX_MAP_SIZE] = {}; // zero-initialize
void SampledContention::dump_and_destroy(size_t /*round*/) {
if (g_cp) {
// Must be protected with mutex to avoid race with deletion of ctx.
// dump_and_destroy is called from the dumping thread only, so this mutex
// is not contended most of the time.
BAIDU_SCOPED_LOCK(g_cp_mutex);
if (g_cp) {
g_cp->dump_and_destroy(this);
return;
}
}
destroy();
}
void SampledContention::destroy() {
_hash_code = 0;
butil::return_object(this);
}
// Remember the conflict hashes for troubleshooting, should be 0 most of the time.
static butil::static_atomic<int64_t> g_nconflicthash = BUTIL_STATIC_ATOMIC_INIT(0);
static int64_t get_nconflicthash(void*) {
return g_nconflicthash.load(butil::memory_order_relaxed);
}
// Start profiling contention.
bool ContentionProfilerStart(const char* filename) {
if (filename == NULL) {
LOG(ERROR) << "Parameter [filename] is NULL";
return false;
}
// g_cp is also the flag marking start/stop.
if (g_cp) {
return false;
}
// Create related global bvar lazily.
static bvar::PassiveStatus<int64_t> g_nconflicthash_var
("contention_profiler_conflict_hash", get_nconflicthash, NULL);
static bvar::DisplaySamplingRatio g_sampling_ratio_var(
"contention_profiler_sampling_ratio", &g_cp_sl);
// Optimistic locking. An unused ContentionProfiler does not write any file.
std::unique_ptr<ContentionProfiler> ctx(new ContentionProfiler(filename));
{
BAIDU_SCOPED_LOCK(g_cp_mutex);
if (g_cp) {
return false;
}
g_cp = ctx.release();
++g_cp_version; // invalidate non-empty entries that may exist.
}
return true;
}
// Stop contention profiler.
void ContentionProfilerStop() {
ContentionProfiler* ctx = NULL;
if (g_cp) {
std::unique_lock<pthread_mutex_t> mu(g_cp_mutex);
if (g_cp) {
ctx = g_cp;
g_cp = NULL;
mu.unlock();
// Make sure it's initialized in case no sample was gathered,
// otherwise nothing will be written and the subsequent pprof will fail.
ctx->init_if_needed();
// Deletion is safe because usages of g_cp are inside g_cp_mutex.
delete ctx;
return;
}
}
LOG(ERROR) << "Contention profiler is not started!";
}
BUTIL_FORCE_INLINE bool
is_contention_site_valid(const bthread_contention_site_t& cs) {
return cs.sampling_range;
}
BUTIL_FORCE_INLINE void
make_contention_site_invalid(bthread_contention_site_t* cs) {
cs->sampling_range = 0;
}
// Replace pthread_mutex_lock and pthread_mutex_unlock:
// The first call to sys_pthread_mutex_lock sets sys_pthread_mutex_lock to
// the real function so that subsequent calls go directly to the real
// function. This technique avoids calling pthread_once each time.
typedef int (*MutexOp)(pthread_mutex_t*);
int first_sys_pthread_mutex_lock(pthread_mutex_t* mutex);
int first_sys_pthread_mutex_unlock(pthread_mutex_t* mutex);
static MutexOp sys_pthread_mutex_lock = first_sys_pthread_mutex_lock;
static MutexOp sys_pthread_mutex_unlock = first_sys_pthread_mutex_unlock;
static pthread_once_t init_sys_mutex_lock_once = PTHREAD_ONCE_INIT;
// dlsym may call malloc to allocate space for dlerror and cause the
// contention profiler to deadlock at bootstrapping when the program is
// linked with libunwind. The deadlock backtrace:
// #0 0x00007effddc99b80 in __nanosleep_nocancel () at ../sysdeps/unix/syscall-template.S:81
// #1 0x00000000004b4df7 in butil::internal::SpinLockDelay(int volatile*, int, int) ()
// #2 0x00000000004b4d57 in SpinLock::SlowLock() ()
// #3 0x00000000004b4a63 in tcmalloc::ThreadCache::InitModule() ()
// #4 0x00000000004aa2b5 in tcmalloc::ThreadCache::GetCache() ()
// #5 0x000000000040c6c5 in (anonymous namespace)::do_malloc_no_errno(unsigned long) [clone.part.16] ()
// #6 0x00000000006fc125 in tc_calloc ()
// #7 0x00007effdd245690 in _dlerror_run (operate=operate@entry=0x7effdd245130 <dlsym_doit>, args=args@entry=0x7fff483dedf0) at dlerror.c:141
// #8 0x00007effdd245198 in __dlsym (handle=<optimized out>, name=<optimized out>) at dlsym.c:70
// #9 0x0000000000666517 in bthread::init_sys_mutex_lock () at bthread/mutex.cpp:358
// #10 0x00007effddc97a90 in pthread_once () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S:103
// #11 0x000000000066649f in bthread::first_sys_pthread_mutex_lock (mutex=0xbaf880 <_ULx86_64_lock>) at bthread/mutex.cpp:366
// #12 0x00000000006678bc in pthread_mutex_lock_impl (mutex=0xbaf880 <_ULx86_64_lock>) at bthread/mutex.cpp:489
// #13 pthread_mutex_lock (__mutex=__mutex@entry=0xbaf880 <_ULx86_64_lock>) at bthread/mutex.cpp:751
// #14 0x00000000004c6ea1 in _ULx86_64_init () at x86_64/Gglobal.c:83
// #15 0x00000000004c44fb in _ULx86_64_init_local (cursor=0x7fff483df340, uc=0x7fff483def90) at x86_64/Ginit_local.c:47
// #16 0x00000000004b5012 in GetStackTrace(void**, int, int) ()
// #17 0x00000000004b2095 in tcmalloc::PageHeap::GrowHeap(unsigned long) ()
// #18 0x00000000004b23a3 in tcmalloc::PageHeap::New(unsigned long) ()
// #19 0x00000000004ad457 in tcmalloc::CentralFreeList::Populate() ()
// #20 0x00000000004ad628 in tcmalloc::CentralFreeList::FetchFromSpansSafe() ()
// #21 0x00000000004ad6a3 in tcmalloc::CentralFreeList::RemoveRange(void**, void**, int) ()
// #22 0x00000000004b3ed3 in tcmalloc::ThreadCache::FetchFromCentralCache(unsigned long, unsigned long) ()
// #23 0x00000000006fbb9a in tc_malloc ()
// Call _dl_sym, a private function in glibc, to temporarily work around
// the malloc-induced deadlock. This fix is hardly portable.
static void init_sys_mutex_lock() {
#if defined(OS_LINUX)
// TODO: may need dlvsym when GLIBC has multiple versions of the same symbol.
// http://blog.fesnel.com/blog/2009/08/25/preloading-with-multiple-symbol-versions
if (_dl_sym) {
sys_pthread_mutex_lock = (MutexOp)_dl_sym(RTLD_NEXT, "pthread_mutex_lock", (void*)init_sys_mutex_lock);
sys_pthread_mutex_unlock = (MutexOp)_dl_sym(RTLD_NEXT, "pthread_mutex_unlock", (void*)init_sys_mutex_lock);
} else {
// _dl_sym may be an undefined reference on some systems; fall back to dlsym
sys_pthread_mutex_lock = (MutexOp)dlsym(RTLD_NEXT, "pthread_mutex_lock");
sys_pthread_mutex_unlock = (MutexOp)dlsym(RTLD_NEXT, "pthread_mutex_unlock");
}
#elif defined(OS_MACOSX)
// TODO: look for a workaround for dlsym on Mac
sys_pthread_mutex_lock = (MutexOp)dlsym(RTLD_NEXT, "pthread_mutex_lock");
sys_pthread_mutex_unlock = (MutexOp)dlsym(RTLD_NEXT, "pthread_mutex_unlock");
#endif
}
// Make sure pthread functions are ready before main().
const int ALLOW_UNUSED dummy = pthread_once(&init_sys_mutex_lock_once, init_sys_mutex_lock);
int first_sys_pthread_mutex_lock(pthread_mutex_t* mutex) {
pthread_once(&init_sys_mutex_lock_once, init_sys_mutex_lock);
return sys_pthread_mutex_lock(mutex);
}
int first_sys_pthread_mutex_unlock(pthread_mutex_t* mutex) {
pthread_once(&init_sys_mutex_lock_once, init_sys_mutex_lock);
return sys_pthread_mutex_unlock(mutex);
}
inline uint64_t hash_mutex_ptr(const pthread_mutex_t* m) {
return butil::fmix64((uint64_t)m);
}
// Mark that we're inside locking so that pthread_mutex calls inside
// collecting code are never sampled; otherwise deadlock may occur.
static __thread bool tls_inside_lock = false;
// Speed up with TLS:
// Most pthread_mutexes are locked and unlocked in the same thread. Putting
// contention information in TLS avoids collisions that may occur in
// g_mutex_map. However, when the user unlocks in another thread, the info
// cached in the locking thread is not removed, bloating the space. We use a
// simple strategy to solve the issue: if a thread has enough thread-local
// space to store the info, save it there, otherwise save it in g_mutex_map.
// For a program that locks and unlocks in the same thread and does not lock
// a lot of mutexes simultaneously, this strategy always uses the TLS.
#ifndef DONT_SPEEDUP_PTHREAD_CONTENTION_PROFILER_WITH_TLS
const int TLS_MAX_COUNT = 3;
struct MutexAndContentionSite {
pthread_mutex_t* mutex;
bthread_contention_site_t csite;
};
struct TLSPthreadContentionSites {
int count;
uint64_t cp_version;
MutexAndContentionSite list[TLS_MAX_COUNT];
};
static __thread TLSPthreadContentionSites tls_csites = {0,0,{}};
#endif // DONT_SPEEDUP_PTHREAD_CONTENTION_PROFILER_WITH_TLS
// Guaranteed on Linux/Windows.
const int PTR_BITS = 48;
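// Each occupied entry packs the low 16 bits of g_cp_version into the high
// 16 bits of versioned_mutex and the mutex address into the low PTR_BITS
// bits, i.e. (g_cp_version << PTR_BITS) | (uint64_t)mutex. Entries stamped
// with an older version are treated as free and may be overwritten.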
inline bthread_contention_site_t*
add_pthread_contention_site(pthread_mutex_t* mutex) {
MutexMapEntry& entry = g_mutex_map[hash_mutex_ptr(mutex) & (MUTEX_MAP_SIZE - 1)];
butil::static_atomic<uint64_t>& m = entry.versioned_mutex;
uint64_t expected = m.load(butil::memory_order_relaxed);
// If the entry is not used or used by previous profiler, try to CAS it.
if (expected == 0 ||
(expected >> PTR_BITS) != (g_cp_version & ((1 << (64 - PTR_BITS)) - 1))) {
uint64_t desired = (g_cp_version << PTR_BITS) | (uint64_t)mutex;
if (m.compare_exchange_strong(
expected, desired, butil::memory_order_acquire)) {
return &entry.csite;
}
}
g_nconflicthash.fetch_add(1, butil::memory_order_relaxed);
return NULL;
}
inline bool remove_pthread_contention_site(
pthread_mutex_t* mutex, bthread_contention_site_t* saved_csite) {
MutexMapEntry& entry = g_mutex_map[hash_mutex_ptr(mutex) & (MUTEX_MAP_SIZE - 1)];
butil::static_atomic<uint64_t>& m = entry.versioned_mutex;
if ((m.load(butil::memory_order_relaxed) & ((((uint64_t)1) << PTR_BITS) - 1))
!= (uint64_t)mutex) {
// This branch should be the most common case since most locks are
// neither contended nor sampled. We have one memory indirection and
// several bitwise operations here; the cost should be ~5-50ns.
return false;
}
// Although this branch is inside a contended lock, we should also make it
// as simple as possible because altering the critical section too much
// may have an unpredictable impact on thread interleaving, which makes
// the profiling result less accurate.
*saved_csite = entry.csite;
make_contention_site_invalid(&entry.csite);
m.store(0, butil::memory_order_release);
return true;
}
// Submit the contention along with the callsite('s stacktrace)
void submit_contention(const bthread_contention_site_t& csite, int64_t now_ns) {
tls_inside_lock = true;
auto sc = butil::get_object<SampledContention>();
// Normalize duration_ns and count so that they're addable in later
// processing. Notice that sampling_range is adjusted periodically by
// the collecting thread.
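// Roughly sampling_range out of every COLLECTOR_SAMPLING_BASE contentions
// are sampled, so each sample is scaled up by the inverse ratio to
// approximate the real total duration and count.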
sc->duration_ns = csite.duration_ns * bvar::COLLECTOR_SAMPLING_BASE
/ csite.sampling_range;
sc->count = bvar::COLLECTOR_SAMPLING_BASE / (double)csite.sampling_range;
sc->nframes = GetStackTrace
? GetStackTrace(sc->stack, arraysize(sc->stack), 0)
: backtrace(sc->stack, arraysize(sc->stack)); // may lock
sc->submit(now_ns / 1000); // may lock
tls_inside_lock = false;
}
BUTIL_FORCE_INLINE int pthread_mutex_lock_impl(pthread_mutex_t* mutex) {
// Don't change behavior of lock when profiler is off.
if (!g_cp ||
// collecting code including backtrace() and submit() may call
// pthread_mutex_lock and cause deadlock. Don't sample.
tls_inside_lock) {
return sys_pthread_mutex_lock(mutex);
}
// Don't slow down non-contended locks.
int rc = pthread_mutex_trylock(mutex);
if (rc != EBUSY) {
return rc;
}
// Ask bvar::Collector if this (contended) locking should be sampled
const size_t sampling_range = bvar::is_collectable(&g_cp_sl);
bthread_contention_site_t* csite = NULL;
#ifndef DONT_SPEEDUP_PTHREAD_CONTENTION_PROFILER_WITH_TLS
TLSPthreadContentionSites& fast_alt = tls_csites;
if (fast_alt.cp_version != g_cp_version) {
fast_alt.cp_version = g_cp_version;
fast_alt.count = 0;
}
if (fast_alt.count < TLS_MAX_COUNT) {
MutexAndContentionSite& entry = fast_alt.list[fast_alt.count++];
entry.mutex = mutex;
csite = &entry.csite;
if (!sampling_range) {
make_contention_site_invalid(&entry.csite);
return sys_pthread_mutex_lock(mutex);
}
}
#endif
if (!sampling_range) { // don't sample
return sys_pthread_mutex_lock(mutex);
}
// Lock and monitor the waiting time.
const int64_t start_ns = butil::cpuwide_time_ns();
rc = sys_pthread_mutex_lock(mutex);
if (!rc) { // Inside lock
if (!csite) {
csite = add_pthread_contention_site(mutex);
if (csite == NULL) {
return rc;
}
}
csite->duration_ns = butil::cpuwide_time_ns() - start_ns;
csite->sampling_range = sampling_range;
} // else rare
return rc;
}
BUTIL_FORCE_INLINE int pthread_mutex_unlock_impl(pthread_mutex_t* mutex) {
// Don't change behavior of unlock when profiler is off.
if (!g_cp || tls_inside_lock) {
// This branch brings an issue: an entry created by
// add_pthread_contention_site may not be cleared. Thus we add a
// 16-bit rolling version to the entry to detect such stale entries.
return sys_pthread_mutex_unlock(mutex);
}
int64_t unlock_start_ns = 0;
bool miss_in_tls = true;
bthread_contention_site_t saved_csite = {0,0};
#ifndef DONT_SPEEDUP_PTHREAD_CONTENTION_PROFILER_WITH_TLS
TLSPthreadContentionSites& fast_alt = tls_csites;
for (int i = fast_alt.count - 1; i >= 0; --i) {
if (fast_alt.list[i].mutex == mutex) {
if (is_contention_site_valid(fast_alt.list[i].csite)) {
saved_csite = fast_alt.list[i].csite;
unlock_start_ns = butil::cpuwide_time_ns();
}
fast_alt.list[i] = fast_alt.list[--fast_alt.count];
miss_in_tls = false;
break;
}
}
#endif
// Check the map to see if the lock is sampled. Notice that we're still
// inside critical section.
if (miss_in_tls) {
if (remove_pthread_contention_site(mutex, &saved_csite)) {
unlock_start_ns = butil::cpuwide_time_ns();
}
}
const int rc = sys_pthread_mutex_unlock(mutex);
// [Outside lock]
if (unlock_start_ns) {
const int64_t unlock_end_ns = butil::cpuwide_time_ns();
saved_csite.duration_ns += unlock_end_ns - unlock_start_ns;
submit_contention(saved_csite, unlock_end_ns);
}
return rc;
}
// Implement bthread_mutex_t related functions
struct MutexInternal {
butil::static_atomic<unsigned char> locked;
butil::static_atomic<unsigned char> contended;
unsigned short padding;
};
const MutexInternal MUTEX_CONTENDED_RAW = {{1},{1},0};
const MutexInternal MUTEX_LOCKED_RAW = {{1},{0},0};
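// Viewed as a whole unsigned word, the mutex has three states: 0 (unlocked),
// MUTEX_LOCKED_RAW (locked, no waiter recorded) and MUTEX_CONTENDED_RAW
// (locked with possible waiters). The unlock paths only wake a waiter when
// the previous value was not MUTEX_LOCKED.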
// Defined as macros rather than constants because such constants cannot be
// put in a read-only section and are affected by the initialization-order
// fiasco.
#define BTHREAD_MUTEX_CONTENDED (*(const unsigned*)&bthread::MUTEX_CONTENDED_RAW)
#define BTHREAD_MUTEX_LOCKED (*(const unsigned*)&bthread::MUTEX_LOCKED_RAW)
BAIDU_CASSERT(sizeof(unsigned) == sizeof(MutexInternal),
sizeof_mutex_internal_must_equal_unsigned);
inline int mutex_lock_contended(bthread_mutex_t* m) {
butil::atomic<unsigned>* whole = (butil::atomic<unsigned>*)m->butex;
while (whole->exchange(BTHREAD_MUTEX_CONTENDED) & BTHREAD_MUTEX_LOCKED) {
if (bthread::butex_wait(whole, BTHREAD_MUTEX_CONTENDED, NULL) < 0 &&
errno != EWOULDBLOCK && errno != EINTR/*note*/) {
// a mutex lock should ignore interruptions in general since
// user code is unlikely to check the return value.
return errno;
}
}
return 0;
}
inline int mutex_timedlock_contended(
bthread_mutex_t* m, const struct timespec* __restrict abstime) {
butil::atomic<unsigned>* whole = (butil::atomic<unsigned>*)m->butex;
while (whole->exchange(BTHREAD_MUTEX_CONTENDED) & BTHREAD_MUTEX_LOCKED) {
if (bthread::butex_wait(whole, BTHREAD_MUTEX_CONTENDED, abstime) < 0 &&
errno != EWOULDBLOCK && errno != EINTR/*note*/) {
// a mutex lock should ignore interruptions in general since
// user code is unlikely to check the return value.
return errno;
}
}
return 0;
}
#ifdef BTHREAD_USE_FAST_PTHREAD_MUTEX
namespace internal {
int FastPthreadMutex::lock_contended() {
butil::atomic<unsigned>* whole = (butil::atomic<unsigned>*)&_futex;
while (whole->exchange(BTHREAD_MUTEX_CONTENDED) & BTHREAD_MUTEX_LOCKED) {
if (futex_wait_private(whole, BTHREAD_MUTEX_CONTENDED, NULL) < 0
&& errno != EWOULDBLOCK) {
return errno;
}
}
return 0;
}
void FastPthreadMutex::lock() {
bthread::MutexInternal* split = (bthread::MutexInternal*)&_futex;
if (split->locked.exchange(1, butil::memory_order_acquire)) {
(void)lock_contended();
}
}
bool FastPthreadMutex::try_lock() {
bthread::MutexInternal* split = (bthread::MutexInternal*)&_futex;
return !split->locked.exchange(1, butil::memory_order_acquire);
}
void FastPthreadMutex::unlock() {
butil::atomic<unsigned>* whole = (butil::atomic<unsigned>*)&_futex;
const unsigned prev = whole->exchange(0, butil::memory_order_release);
// CAUTION: the mutex may be destroyed, check comments before butex_create
if (prev != BTHREAD_MUTEX_LOCKED) {
futex_wake_private(whole, 1);
}
}
} // namespace internal
#endif // BTHREAD_USE_FAST_PTHREAD_MUTEX
} // namespace bthread
extern "C" {
int bthread_mutex_init(bthread_mutex_t* __restrict m,
const bthread_mutexattr_t* __restrict) {
bthread::make_contention_site_invalid(&m->csite);
m->butex = bthread::butex_create_checked<unsigned>();
if (!m->butex) {
return ENOMEM;
}
*m->butex = 0;
return 0;
}
int bthread_mutex_destroy(bthread_mutex_t* m) {
bthread::butex_destroy(m->butex);
return 0;
}
int bthread_mutex_trylock(bthread_mutex_t* m) {
bthread::MutexInternal* split = (bthread::MutexInternal*)m->butex;
if (!split->locked.exchange(1, butil::memory_order_acquire)) {
return 0;
}
return EBUSY;
}
int bthread_mutex_lock_contended(bthread_mutex_t* m) {
return bthread::mutex_lock_contended(m);
}
int bthread_mutex_lock(bthread_mutex_t* m) {
bthread::MutexInternal* split = (bthread::MutexInternal*)m->butex;
if (!split->locked.exchange(1, butil::memory_order_acquire)) {
return 0;
}
// Don't sample when contention profiler is off.
if (!bthread::g_cp) {
return bthread::mutex_lock_contended(m);
}
// Ask Collector if this (contended) locking should be sampled.
const size_t sampling_range = bvar::is_collectable(&bthread::g_cp_sl);
if (!sampling_range) { // Don't sample
return bthread::mutex_lock_contended(m);
}
// Start sampling.
const int64_t start_ns = butil::cpuwide_time_ns();
// NOTE: Don't modify m->csite outside lock since multiple threads are
// still contending with each other.
const int rc = bthread::mutex_lock_contended(m);
if (!rc) { // Inside lock
m->csite.duration_ns = butil::cpuwide_time_ns() - start_ns;
m->csite.sampling_range = sampling_range;
} // else rare
return rc;
}
int bthread_mutex_timedlock(bthread_mutex_t* __restrict m,
const struct timespec* __restrict abstime) {
bthread::MutexInternal* split = (bthread::MutexInternal*)m->butex;
if (!split->locked.exchange(1, butil::memory_order_acquire)) {
return 0;
}
// Don't sample when contention profiler is off.
if (!bthread::g_cp) {
return bthread::mutex_timedlock_contended(m, abstime);
}
// Ask Collector if this (contended) locking should be sampled.
const size_t sampling_range = bvar::is_collectable(&bthread::g_cp_sl);
if (!sampling_range) { // Don't sample
return bthread::mutex_timedlock_contended(m, abstime);
}
// Start sampling.
const int64_t start_ns = butil::cpuwide_time_ns();
// NOTE: Don't modify m->csite outside lock since multiple threads are
// still contending with each other.
const int rc = bthread::mutex_timedlock_contended(m, abstime);
if (!rc) { // Inside lock
m->csite.duration_ns = butil::cpuwide_time_ns() - start_ns;
m->csite.sampling_range = sampling_range;
} else if (rc == ETIMEDOUT) {
// Failed to lock due to ETIMEDOUT; submit the elapsed time directly.
const int64_t end_ns = butil::cpuwide_time_ns();
const bthread_contention_site_t csite = {end_ns - start_ns, sampling_range};
bthread::submit_contention(csite, end_ns);
}
return rc;
}
int bthread_mutex_unlock(bthread_mutex_t* m) {
butil::atomic<unsigned>* whole = (butil::atomic<unsigned>*)m->butex;
bthread_contention_site_t saved_csite = {0, 0};
if (bthread::is_contention_site_valid(m->csite)) {
saved_csite = m->csite;
bthread::make_contention_site_invalid(&m->csite);
}
const unsigned prev = whole->exchange(0, butil::memory_order_release);
// CAUTION: the mutex may be destroyed, check comments before butex_create
if (prev == BTHREAD_MUTEX_LOCKED) {
return 0;
}
// Wakeup one waiter
if (!bthread::is_contention_site_valid(saved_csite)) {
bthread::butex_wake(whole);
return 0;
}
const int64_t unlock_start_ns = butil::cpuwide_time_ns();
bthread::butex_wake(whole);
const int64_t unlock_end_ns = butil::cpuwide_time_ns();
saved_csite.duration_ns += unlock_end_ns - unlock_start_ns;
bthread::submit_contention(saved_csite, unlock_end_ns);
return 0;
}
int pthread_mutex_lock (pthread_mutex_t *__mutex) {
return bthread::pthread_mutex_lock_impl(__mutex);
}
int pthread_mutex_unlock (pthread_mutex_t *__mutex) {
return bthread::pthread_mutex_unlock_impl(__mutex);
}
} // extern "C"