blob: 5361951a9932aa04a94cbb345c60635493410cef [file] [log] [blame]
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/thread_local.h"
#include "util/mutexlock.h"
#include "port/likely.h"
#include <stdlib.h>
namespace rocksdb {
// One slot of per-thread storage: an atomically accessed, untyped pointer.
struct Entry {
  // A fresh slot holds no value.
  Entry() : ptr(nullptr) {}
  // std::atomic is not copyable, so the copy constructor transfers the
  // currently stored value explicitly. A copy constructor is required for
  // Entry to be held in a std::vector that gets resized.
  Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {}
  std::atomic<void*> ptr;
};
class StaticMeta;
// This is the structure that is declared as "thread_local" storage.
// The vector keeps a list of atomic pointers, one for each instance, for the
// "current" thread. The vector is indexed by an Id that is unique in the
// process and associated with one ThreadLocalPtr instance. The Id is assigned
// by a global StaticMeta singleton. So if we instantiated 3 ThreadLocalPtr
// instances, each thread will have a ThreadData with a vector of size 3:
// ---------------------------------------------------
// | | instance 1 | instance 2 | instance 3 |
// ---------------------------------------------------
// | thread 1 | void* | void* | void* | <- ThreadData
// ---------------------------------------------------
// | thread 2 | void* | void* | void* | <- ThreadData
// ---------------------------------------------------
// | thread 3 | void* | void* | void* | <- ThreadData
// ---------------------------------------------------
struct ThreadData {
  // The chain links start out null so that a node which has not been linked
  // yet never carries indeterminate pointers.
  explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst)
      : entries(), next(nullptr), prev(nullptr), inst(_inst) {}
  // One slot per ThreadLocalPtr instance, indexed by instance Id.
  std::vector<Entry> entries;
  // Links in the doubly linked chain of all threads' ThreadData.
  ThreadData* next;
  ThreadData* prev;
  // The StaticMeta this thread data was created for; cached so that the
  // thread-exit path does not have to call Instance().
  ThreadLocalPtr::StaticMeta* inst;
};
// Process-wide bookkeeping shared by all ThreadLocalPtr instances: hands out
// instance Ids, chains together every thread's ThreadData, and stores the
// per-Id UnrefHandler.
class ThreadLocalPtr::StaticMeta {
 public:
  StaticMeta();

  // Return the next available Id
  uint32_t GetId();
  // Return the next available Id without claiming it
  uint32_t PeekId() const;
  // Return the given Id back to the free pool. This also triggers
  // UnrefHandler for associated pointer value (if not NULL) for all threads.
  void ReclaimId(uint32_t id);

  // Return the pointer value for the given id for the current thread.
  void* Get(uint32_t id) const;
  // Reset the pointer value for the given id for the current thread.
  void Reset(uint32_t id, void* ptr);
  // Atomically swap the supplied ptr and return the previous value
  void* Swap(uint32_t id, void* ptr);
  // Atomically compare and swap the provided value only if it equals
  // to expected value.
  bool CompareAndSwap(uint32_t id, void* ptr, void*& expected);
  // Reset all thread local data to replacement, and return non-nullptr
  // data for all existing threads
  void Scrape(uint32_t id, autovector<void*>* ptrs, void* const replacement);
  // Update res by applying func on each thread-local value. Holds a lock that
  // prevents unref handler from running during this call, but clients must
  // still provide external synchronization since the owning thread can
  // access the values without internal locking, e.g., via Get() and Reset().
  void Fold(uint32_t id, FoldFunc func, void* res);

  // Register the UnrefHandler for id
  void SetHandler(uint32_t id, UnrefHandler handler);

  // protect inst, next_instance_id_, free_instance_ids_, head_,
  // ThreadData.entries
  //
  // Note that here we prefer a function static variable instead of the usual
  // global static variable. The reason is that C++ destroys static variables
  // in the reverse order of their construction. However, C++ does not
  // guarantee any construction order when global static variables are
  // defined in different files, while function static variables are
  // initialized when their function is first called. As a result, the
  // construction order of function static variables can be controlled by
  // invoking their first function calls in the right order.
  //
  // For instance, the following function contains a function static
  // variable. We place a dummy call of this function inside Env::Default()
  // to ensure the desired construction order.
  static port::Mutex* Mutex();

  // Returns the member mutex of the current StaticMeta. In general,
  // Mutex() should be used instead of this one. However, in case where
  // the static variable inside Instance() goes out of scope, MemberMutex()
  // should be used. One example is OnThreadExit() function.
  port::Mutex* MemberMutex() { return &mutex_; }

 private:
  // Get UnrefHandler for id without acquiring the mutex
  // REQUIRES: mutex locked
  UnrefHandler GetHandler(uint32_t id);

  // Triggered before a thread terminates
  static void OnThreadExit(void* ptr);

  // Add current thread's ThreadData to the global chain
  // REQUIRES: mutex locked
  void AddThreadData(ThreadData* d);

  // Remove current thread's ThreadData from the global chain
  // REQUIRES: mutex locked
  void RemoveThreadData(ThreadData* d);

  static ThreadData* GetThreadLocal();

  uint32_t next_instance_id_;
  // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed
  // frequently. This also prevents it from blowing up the vector space.
  autovector<uint32_t> free_instance_ids_;
  // Chain all thread local structure together. This is necessary since
  // when one ThreadLocalPtr gets destroyed, we need to loop over each
  // thread's version of pointer corresponding to that instance and
  // call UnrefHandler for it.
  ThreadData head_;

  std::unordered_map<uint32_t, UnrefHandler> handler_map_;

  // The private mutex. Developers should always use Mutex() instead of
  // using this variable directly.
  port::Mutex mutex_;
  // Thread local storage
  static __thread ThreadData* tls_;

  // Used to make thread exit trigger possible if !defined(OS_MACOSX).
  // Otherwise, used to retrieve thread data.
  pthread_key_t pthread_key_;
};
// Out-of-class definition of the static per-thread ThreadData pointer; it is
// initialized to null.
__thread ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
// Windows doesn't support a per-thread destructor with its
// TLS primitives. So, we build it manually by inserting a
// function to be called on each thread's exit.
// See
// and
// Really, we do this to have a clear conscience, since using TLS with
// thread pools is iffy, although it is OK within a request. Otherwise,
// threads have no identity in their modern use.
// This runs on Windows only and is called from the System Loader.
#ifdef OS_WIN
// The Windows cleanup routine is invoked from the System Loader with a
// different signature, so we cannot directly hook up the original
// OnThreadExit, which is a private member.
// Instead, the StaticMeta class shares the address of that function with us
// so that we can invoke it.
namespace wintlscleanup {

// This is set to OnThreadExit in StaticMeta singleton constructor
UnrefHandler thread_local_inclass_routine = nullptr;
// Set to the StaticMeta pthread key in the same constructor; -1 means
// "not yet shared".
pthread_key_t thread_local_key = -1;

// Static callback function to call with each thread termination.
// Forwards the exiting thread's stored TLS value, if any, to the routine
// registered in thread_local_inclass_routine.
void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
  // We decided to punt on PROCESS_EXIT
  if (DLL_THREAD_DETACH == reason) {
    // Nothing to do until the StaticMeta constructor has published both the
    // key and the routine.
    if (thread_local_key != pthread_key_t(-1) &&
        thread_local_inclass_routine != nullptr) {
      void* tls = pthread_getspecific(thread_local_key);
      if (tls != nullptr) {
        thread_local_inclass_routine(tls);
      }
    }
  }
}

}  // wintlscleanup
// extern "C" suppresses C++ name mangling so we know the symbol name for the
// linker /INCLUDE:symbol pragma below.
extern "C" {
#ifdef _MSC_VER
// The linker must not discard thread_callback_on_exit. (We force a reference
// to this variable with a linker /include:symbol pragma to ensure that.) If
// this variable is discarded, the OnThreadExit function will never be called.
#ifdef _WIN64
// .CRT section is merged with .rdata on x64 so it must be constant data.
#pragma const_seg(".CRT$XLB")
// When defining a const variable, it must have external linkage to be sure the
// linker doesn't discard it.
extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
// Reset the default section.
#pragma const_seg()
#pragma comment(linker, "/include:_tls_used")
#pragma comment(linker, "/include:p_thread_callback_on_exit")
#else // _WIN64
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
// Reset the default section.
#pragma data_seg()
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
#endif // _WIN64
if (dwReason == DLL_THREAD_DETACH)
wintlscleanup::WinOnThreadExit(h, dwReason, pv);
return TRUE;
} // extern "C"
#endif // OS_WIN
void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); }
// Returns the process-wide StaticMeta singleton, creating it on first call.
// The object is intentionally never deleted (see below).
ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
  // Here we prefer function static variable instead of global
  // static variable as function static variable is initialized
  // when the function is first called. As a result, we can properly
  // control their construction order by properly preparing their
  // first function call.
  //
  // Note that here we decide to make "inst" a static pointer w/o deleting
  // it at the end instead of a static variable. This is to avoid the following
  // destruction order disaster that happens when a child thread using
  // ThreadLocalPtr dies AFTER the main thread dies: When a child thread
  // happens to use ThreadLocalPtr, it will try to delete its thread-local
  // data in its OnThreadExit when the child thread dies. However,
  // OnThreadExit depends on the following variable. As a result, if the main
  // thread dies before any child thread that happens to use ThreadLocalPtr
  // dies, then the destruction of the following variable will go first, then
  // OnThreadExit, therefore causing invalid access.
  //
  // The above problem can be solved by using thread_local to store tls_ instead
  // of using __thread. The major difference between thread_local and __thread
  // is that thread_local supports dynamic construction and destruction of
  // non-primitive typed variables. As a result, we can guarantee the
  // destruction order even when the main thread dies before any child threads.
  // However, thread_local is not supported in all compilers that accept
  // -std=c++11 (e.g., Mac with XCode < 8. XCode 8+ supports thread_local).
  static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta();
  return inst;
}
// Returns the address of the mutex_ member of the StaticMeta obtained from
// Instance().
port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() {
  StaticMeta* const meta = Instance();
  return &meta->mutex_;
}
// Thread-exit hook. `ptr` is the exiting thread's ThreadData. Unlinks it from
// the global chain, runs the registered UnrefHandler on every non-null value
// the thread still holds, and frees the ThreadData.
void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) {
  auto* tls = static_cast<ThreadData*>(ptr);
  assert(tls != nullptr);

  // Use the cached StaticMeta::Instance() instead of directly calling
  // the variable inside StaticMeta::Instance() might already go out of
  // scope here in case this OnThreadExit is called after the main thread
  // dies.
  auto* inst = tls->inst;
  pthread_setspecific(inst->pthread_key_, nullptr);

  MutexLock l(inst->MemberMutex());
  // Unlink before deleting so the chain never holds a dangling node.
  inst->RemoveThreadData(tls);
  // Unref stored pointers of current thread from all instances
  uint32_t id = 0;
  for (auto& e : tls->entries) {
    void* raw = e.ptr.load();
    if (raw != nullptr) {
      auto unref = inst->GetHandler(id);
      if (unref != nullptr) {
        unref(raw);
      }
    }
    // entries is indexed by instance Id, so the Id advances with the slot.
    ++id;
  }
  // Delete thread local structure no matter if it is Mac platform
  delete tls;
}
// Creates the pthread key whose destructor is OnThreadExit, arranges cleanup
// for the main thread, and initializes the ThreadData chain to an empty ring
// whose only node is head_.
ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0), head_(this) {
  if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
    // Without the key there is no way to run per-thread cleanup.
    abort();
  }

  // OnThreadExit is not getting called on the main thread.
  // Call through the static destructor mechanism to avoid memory leak.
  //
  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
  // singleton (destructors are invoked in reverse order of constructor
  // _completion_); the latter must not mutate internal members. This
  // cleanup mechanism inherently relies on use-after-release of the
  // StaticMeta, and is brittle with respect to compiler-specific handling
  // of memory backing destructed statically-scoped objects. Perhaps
  // registering with atexit(3) would be more robust.
  //
  // This is not required on Windows.
#if !defined(OS_WIN)
  static struct A {
    ~A() {
      // Fetch the calling thread's ThreadData through the pthread key.
      ThreadData* tls_ = static_cast<ThreadData*>(
          pthread_getspecific(Instance()->pthread_key_));
      if (tls_) {
        OnThreadExit(tls_);
      }
    }
  } a;
#endif  // !defined(OS_WIN)

  // An empty chain is a ring in which head_ points at itself.
  head_.next = &head_;
  head_.prev = &head_;

#ifdef OS_WIN
  // Share with Windows its cleanup routine and the key
  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
  wintlscleanup::thread_local_key = pthread_key_;
#endif
}
// Links `d` into the chain immediately before head_, i.e. at the tail of the
// ring. REQUIRES: mutex locked.
void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) {
  d->next = &head_;
  d->prev = head_.prev;
  head_.prev->next = d;
  head_.prev = d;
}
// Unlinks `d` from the chain and leaves it pointing at itself, so that it no
// longer references its former neighbours. REQUIRES: mutex locked.
void ThreadLocalPtr::StaticMeta::RemoveThreadData(
    ThreadData* d) {
  d->next->prev = d->prev;
  d->prev->next = d->next;
  d->next = d->prev = d;
}
// Returns the calling thread's ThreadData, creating and registering it on
// first use. Aborts if the value cannot be stored under the pthread key.
ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
  // Make this local variable name look like a member variable so that we
  // can share all the code below
  ThreadData* tls_ =
      static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));

  if (UNLIKELY(tls_ == nullptr)) {
    auto* inst = Instance();
    tls_ = new ThreadData(inst);
    {
      // Register it in the global chain, needs to be done before thread exit
      // handler registration
      MutexLock l(Mutex());
      inst->AddThreadData(tls_);
    }
    // Even it is not OS_MACOSX, need to register value for pthread_key_ so that
    // its exit handler will be triggered.
    if (pthread_setspecific(inst->pthread_key_, tls_) != 0) {
      {
        // Scoped separately from the registration lock above: the same mutex
        // must not be locked twice at once.
        MutexLock l(Mutex());
        inst->RemoveThreadData(tls_);
      }
      delete tls_;
      abort();
    }
  }
  return tls_;
}
// Returns the calling thread's value for `id`, or nullptr when this thread
// has no slot for that id yet.
void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const {
  auto* tls = GetThreadLocal();
  if (UNLIKELY(id >= tls->entries.size())) {
    return nullptr;
  }
  return tls->entries[id].ptr.load(std::memory_order_acquire);
}
// Stores `ptr` as the calling thread's value for `id`, growing the thread's
// slot vector first if it has no slot for that id yet.
void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) {
  auto* tls = GetThreadLocal();
  if (UNLIKELY(id >= tls->entries.size())) {
    // Need mutex to protect entries access within ReclaimId
    MutexLock l(Mutex());
    tls->entries.resize(id + 1);
  }
  tls->entries[id].ptr.store(ptr, std::memory_order_release);
}
// Atomically replaces the calling thread's value for `id` with `ptr` and
// returns the value that was stored before.
void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) {
  auto* tls = GetThreadLocal();
  if (UNLIKELY(id >= tls->entries.size())) {
    // Need mutex to protect entries access within ReclaimId
    MutexLock l(Mutex());
    tls->entries.resize(id + 1);
  }
  return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire);
}
// Atomically sets the calling thread's value for `id` to `ptr` if it currently
// equals `expected`. Returns whether the swap happened; on failure `expected`
// is updated to the value actually found (compare_exchange_strong semantics).
bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr,
                                                void*& expected) {
  auto* tls = GetThreadLocal();
  if (UNLIKELY(id >= tls->entries.size())) {
    // Need mutex to protect entries access within ReclaimId
    MutexLock l(Mutex());
    tls->entries.resize(id + 1);
  }
  return tls->entries[id].ptr.compare_exchange_strong(
      expected, ptr, std::memory_order_release, std::memory_order_relaxed);
}
void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector<void*>* ptrs,
void* const replacement) {
MutexLock l(Mutex());
for (ThreadData* t =; t != &head_; t = t->next) {
if (id < t->entries.size()) {
void* ptr =
t->entries[id], std::memory_order_acquire);
if (ptr != nullptr) {
// Under the global mutex, calls func(value, res) for every thread's non-null
// value stored under `id`.
void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) {
  MutexLock l(Mutex());
  // Walk the ring of ThreadData; head_ itself is the sentinel.
  for (ThreadData* t = head_.next; t != &head_; t = t->next) {
    if (id < t->entries.size()) {
      void* ptr = t->entries[id].ptr.load();
      if (ptr != nullptr) {
        func(ptr, res);
      }
    }
  }
}
// Returns the Id that the next GetId() call would hand out, without claiming
// it.
uint32_t ThreadLocalPtr::TEST_PeekId() {
  return Instance()->PeekId();
}
// Registers (or overwrites) the UnrefHandler for `id` under the global mutex.
void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) {
  MutexLock l(Mutex());
  handler_map_[id] = handler;
}
// Returns the UnrefHandler registered for `id`, or nullptr if there is none.
// Does not lock; the caller must already hold the mutex.
UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) {
  auto iter = handler_map_.find(id);
  if (iter == handler_map_.end()) {
    return nullptr;
  }
  return iter->second;
}
// Claims and returns an instance Id: a recycled one if any is available,
// otherwise a brand-new one.
uint32_t ThreadLocalPtr::StaticMeta::GetId() {
  MutexLock l(Mutex());
  if (free_instance_ids_.empty()) {
    return next_instance_id_++;
  }

  uint32_t id = free_instance_ids_.back();
  // Remove the id from the free pool so it cannot be handed out twice.
  free_instance_ids_.pop_back();
  return id;
}
// Returns the Id that GetId() would hand out next, leaving the free pool and
// the counter untouched.
uint32_t ThreadLocalPtr::StaticMeta::PeekId() const {
  MutexLock l(Mutex());
  if (!free_instance_ids_.empty()) {
    return free_instance_ids_.back();
  }
  return next_instance_id_;
}
void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) {
// This id is not used, go through all thread local data and release
// corresponding value
MutexLock l(Mutex());
auto unref = GetHandler(id);
for (ThreadData* t =; t != &head_; t = t->next) {
if (id < t->entries.size()) {
void* ptr = t->entries[id];
if (ptr != nullptr && unref != nullptr) {
handler_map_[id] = nullptr;
// Claims an instance Id and, if `handler` is non-null, registers it as the
// UnrefHandler for that Id.
ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler)
    : id_(Instance()->GetId()) {
  if (handler != nullptr) {
    Instance()->SetHandler(id_, handler);
  }
}
// Gives this instance's Id back to StaticMeta, which unrefs the values all
// threads still hold for it.
ThreadLocalPtr::~ThreadLocalPtr() {
  Instance()->ReclaimId(id_);
}
// Returns the calling thread's value for this instance.
void* ThreadLocalPtr::Get() const {
  return Instance()->Get(id_);
}
// Sets the calling thread's value for this instance to `ptr`.
void ThreadLocalPtr::Reset(void* ptr) {
  Instance()->Reset(id_, ptr);
}
// Atomically stores `ptr` as the calling thread's value for this instance and
// returns the previous value.
void* ThreadLocalPtr::Swap(void* ptr) {
  return Instance()->Swap(id_, ptr);
}
// Atomically sets the calling thread's value for this instance to `ptr` only
// if it currently equals `expected`; returns whether the swap happened.
bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) {
  return Instance()->CompareAndSwap(id_, ptr, expected);
}
// Replaces every thread's value for this instance with `replacement` and
// collects the previous non-null values into `ptrs`.
void ThreadLocalPtr::Scrape(autovector<void*>* ptrs, void* const replacement) {
  Instance()->Scrape(id_, ptrs, replacement);
}
// Applies func(value, res) to every thread's non-null value for this instance.
void ThreadLocalPtr::Fold(FoldFunc func, void* res) {
  Instance()->Fold(id_, func, res);
}
} // namespace rocksdb