be/src/util/thread.cpp - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 // This file is copied from
 // https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/thread.cc
 // and modified by Doris

 #include "thread.h"

 #include <sys/resource.h>

 #ifndef __APPLE__
 // IWYU pragma: no_include <bits/types/struct_sched_param.h>
 #include <sched.h>
 #include <sys/prctl.h>
 #else
 #include <pthread.h>

 #include <cstdint>
 #endif

 // IWYU pragma: no_include <bthread/errno.h>
 #include <errno.h> // IWYU pragma: keep
 #include <sys/syscall.h>
 #include <time.h>
 #include <unistd.h>

 #include <algorithm>
 // IWYU pragma: no_include <bits/chrono.h>
 #include <chrono> // IWYU pragma: keep
 #include <cstring>
 #include <functional>
 #include <limits>
 #include <map>
 #include <memory>
 #include <mutex>
 #include <ostream>
 #include <string>
 #include <vector>

 #include "common/config.h"
 #include "common/logging.h"
 #include "gutil/atomicops.h"
 #include "gutil/dynamic_annotations.h"
 #include "gutil/map-util.h"
 #include "gutil/stringprintf.h"
 #include "gutil/strings/substitute.h"
 #include "http/web_page_handler.h"
 #include "runtime/thread_context.h"
 #include "util/debug/sanitizer_scopes.h"
 #include "util/easy_json.h"
 #include "util/os_util.h"
 #include "util/scoped_cleanup.h"
 #include "util/url_coding.h"

 namespace doris {

 class ThreadMgr;

 __thread Thread* Thread::_tls = nullptr;

 // Singleton instance of ThreadMgr. Only visible in this file, used only by Thread.
 // // The Thread class adds a reference to thread_manager while it is supervising a thread so
 // // that a race between the end of the process's main thread (and therefore the destruction
 // // of thread_manager) and the end of a thread that tries to remove itself from the
 // // manager after the destruction can be avoided.
 static std::shared_ptr<ThreadMgr> thread_manager;
 //
 // Controls the single (lazy) initialization of thread_manager.
 static std::once_flag once;

 // A singleton class that tracks all live threads, and groups them together for easy
 // auditing. Used only by Thread.
 class ThreadMgr {
 public:
     ThreadMgr() : _threads_started_metric(0), _threads_running_metric(0) {}

     ~ThreadMgr() {
         std::unique_lock<std::mutex> lock(_lock);
         _thread_categories.clear();
     }

     static void set_thread_name(const std::string& name, int64_t tid);

 #ifndef __APPLE__
     static void set_idle_sched(int64_t tid);

     static void set_thread_nice_value(int64_t tid);
 #endif

     // not the system TID, since pthread_t is less prone to being recycled.
     void add_thread(const pthread_t& pthread_id, const std::string& name,
                     const std::string& category, int64_t tid);

     // Removes a thread from the supplied category. If the thread has
     // already been removed, this is a no-op.
     void remove_thread(const pthread_t& pthread_id, const std::string& category);

     void display_thread_callback(const WebPageHandler::ArgumentMap& args, EasyJson* ej) const;

 private:
     // Container class for any details we want to capture about a thread
     // TODO: Add start-time.
     // TODO: Track fragment ID.
     class ThreadDescriptor {
     public:
         ThreadDescriptor() {}
         ThreadDescriptor(std::string category, std::string name, int64_t thread_id)
                 : _name(std::move(name)), _category(std::move(category)), _thread_id(thread_id) {}

         const std::string& name() const { return _name; }
         const std::string& category() const { return _category; }
         int64_t thread_id() const { return _thread_id; }

     private:
         std::string _name;
         std::string _category;
         int64_t _thread_id;
     };

     void summarize_thread_descriptor(const ThreadDescriptor& desc, EasyJson* ej) const;

     // A ThreadCategory is a set of threads that are logically related.
     // TODO: unordered_map is incompatible with pthread_t, but would be more
     // efficient here.
     typedef std::map<const pthread_t, ThreadDescriptor> ThreadCategory;

     // All thread categories, keyed on the category name.
     typedef std::map<std::string, ThreadCategory> ThreadCategoryMap;

     // Protects _thread_categories and thread metrics.
     mutable std::mutex _lock;

     // All thread categories that ever contained a thread, even if empty
     ThreadCategoryMap _thread_categories;

     // Counters to track all-time total number of threads, and the
     // current number of running threads.
     uint64_t _threads_started_metric;
     uint64_t _threads_running_metric;

     DISALLOW_COPY_AND_ASSIGN(ThreadMgr);
 };

 void ThreadMgr::set_thread_name(const std::string& name, int64_t tid) {
     if (tid == getpid()) {
         return;
     }
 #ifdef __APPLE__
     int err = pthread_setname_np(name.c_str());
 #else
     int err = prctl(PR_SET_NAME, name.c_str());
 #endif
     if (err < 0 && errno != EPERM) {
         LOG(ERROR) << "set_thread_name";
     }
 }

 #ifndef __APPLE__
 void ThreadMgr::set_idle_sched(int64_t tid) {
     if (tid == getpid()) {
         return;
     }
     struct sched_param sp = {.sched_priority = 0};
     int err = sched_setscheduler(0, SCHED_IDLE, &sp);
     if (err < 0 && errno != EPERM) {
         LOG(ERROR) << "set_thread_idle_sched";
     }
 }

 void ThreadMgr::set_thread_nice_value(int64_t tid) {
     if (tid == getpid()) {
         return;
     }
     // From Linux kernel:
     // In the current implementation, each unit of difference in the nice values of two
     // processes results in a factor of 1.25 in the degree to which the
     // scheduler favors the higher priority process.  This causes very
     // low nice values (+19) to truly provide little CPU to a process
     // whenever there is any other higher priority load on the system,
     // and makes high nice values (-20) deliver most of the CPU to
     // applications that require it (e.g., some audio applications).

     // Choose 5 as lower priority value, default is 0
     int err = setpriority(PRIO_PROCESS, 0, config::scan_thread_nice_value);
     if (err < 0 && errno != EPERM) {
         LOG(ERROR) << "set_thread_low_priority";
     }
 }
 #endif

 void ThreadMgr::add_thread(const pthread_t& pthread_id, const std::string& name,
                            const std::string& category, int64_t tid) {
     // These annotations cause TSAN to ignore the synchronization on lock_
     // without causing the subsequent mutations to be treated as data races
     // in and of themselves (that's what IGNORE_READS_AND_WRITES does).
     //
     // Why do we need them here and in SuperviseThread()? TSAN operates by
     // observing synchronization events and using them to establish "happens
     // before" relationships between threads. Where these relationships are
     // not built, shared state access constitutes a data race. The
     // synchronization events here, in RemoveThread(), and in
     // SuperviseThread() may cause TSAN to establish a "happens before"
     // relationship between thread functors, ignoring potential data races.
     // The annotations prevent this from happening.
     ANNOTATE_IGNORE_SYNC_BEGIN();
     debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
     {
         std::unique_lock<std::mutex> l(_lock);
         _thread_categories[category][pthread_id] = ThreadDescriptor(category, name, tid);
         _threads_running_metric++;
         _threads_started_metric++;
     }
     ANNOTATE_IGNORE_SYNC_END();
 }

 void ThreadMgr::remove_thread(const pthread_t& pthread_id, const std::string& category) {
     ANNOTATE_IGNORE_SYNC_BEGIN();
     debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
     {
         std::unique_lock<std::mutex> l(_lock);
         auto category_it = _thread_categories.find(category);
         DCHECK(category_it != _thread_categories.end());
         category_it->second.erase(pthread_id);
         _threads_running_metric--;
     }
     ANNOTATE_IGNORE_SYNC_END();
 }

 void ThreadMgr::display_thread_callback(const WebPageHandler::ArgumentMap& args,
                                         EasyJson* ej) const {
     const auto* category_name = FindOrNull(args, "group");
     if (category_name) {
         bool requested_all = (*category_name == "all");
         ej->Set("requested_thread_group", EasyJson::kObject);
         (*ej)["group_name"] = escape_for_html_to_string(*category_name);
         (*ej)["requested_all"] = requested_all;

         // The critical section is as short as possible so as to minimize the delay
         // imposed on new threads that acquire the lock in write mode.
         std::vector<ThreadDescriptor> descriptors_to_print;
         if (!requested_all) {
             std::unique_lock<std::mutex> l(_lock);
             const auto* category = FindOrNull(_thread_categories, *category_name);
             if (!category) {
                 return;
             }
             for (const auto& elem : *category) {
                 descriptors_to_print.emplace_back(elem.second);
             }
         } else {
             std::unique_lock<std::mutex> l(_lock);
             for (const auto& category : _thread_categories) {
                 for (const auto& elem : category.second) {
                     descriptors_to_print.emplace_back(elem.second);
                 }
             }
         }

         EasyJson found = (*ej).Set("found", EasyJson::kObject);
         EasyJson threads = found.Set("threads", EasyJson::kArray);
         for (const auto& desc : descriptors_to_print) {
             summarize_thread_descriptor(desc, &threads);
         }
     } else {
         // List all thread groups and the number of threads running in each.
         std::vector<std::pair<string, uint64_t>> thread_categories_info;
         uint64_t running;
         {
             std::unique_lock<std::mutex> l(_lock);
             running = _threads_running_metric;
             thread_categories_info.reserve(_thread_categories.size());
             for (const auto& category : _thread_categories) {
                 thread_categories_info.emplace_back(category.first, category.second.size());
             }

             (*ej)["total_threads_running"] = running;
             EasyJson groups = ej->Set("groups", EasyJson::kArray);
             for (const auto& elem : thread_categories_info) {
                 string category_arg;
                 url_encode(elem.first, &category_arg);
                 EasyJson group = groups.PushBack(EasyJson::kObject);
                 group["encoded_group_name"] = category_arg;
                 group["group_name"] = elem.first;
                 group["threads_running"] = elem.second;
             }
         }
     }
 }

 void ThreadMgr::summarize_thread_descriptor(const ThreadMgr::ThreadDescriptor& desc,
                                             EasyJson* ej) const {
     ThreadStats stats;
     Status status = get_thread_stats(desc.thread_id(), &stats);
     if (!status.ok()) {
         LOG(WARNING) << "Could not get per-thread statistics: " << status.to_string();
     }

     EasyJson thread = ej->PushBack(EasyJson::kObject);
     thread["thread_name"] = desc.name();
     thread["user_sec"] = static_cast<double>(stats.user_ns) / 1e9;
     thread["kernel_sec"] = static_cast<double>(stats.kernel_ns) / 1e9;
     thread["iowait_sec"] = static_cast<double>(stats.iowait_ns) / 1e9;
 }

 Thread::~Thread() {
     if (_joinable) {
         int ret = pthread_detach(_thread);
         CHECK_EQ(ret, 0);
     }
 }

 void Thread::set_self_name(const std::string& name) {
     ThreadMgr::set_thread_name(name, current_thread_id());
 }

 #ifndef __APPLE__
 void Thread::set_idle_sched() {
     ThreadMgr::set_idle_sched(current_thread_id());
 }

 void Thread::set_thread_nice_value() {
     ThreadMgr::set_thread_nice_value(current_thread_id());
 }
 #endif

 void Thread::join() {
     static_cast<void>(ThreadJoiner(this).join());
 }

 int64_t Thread::tid() const {
     int64_t t = base::subtle::Acquire_Load(&_tid);
     if (t != PARENT_WAITING_TID) {
         return _tid;
     }
     return wait_for_tid();
 }

 pthread_t Thread::pthread_id() const {
     return _thread;
 }

 const std::string& Thread::name() const {
     return _name;
 }

 const std::string& Thread::category() const {
     return _category;
 }

 std::string Thread::to_string() const {
     return strings::Substitute("Thread $0 (name: \"$1\", category: \"$2\")", tid(), _name,
                                _category);
 }

 Thread* Thread::current_thread() {
     return _tls;
 }

 int64_t Thread::unique_thread_id() {
 #ifdef __APPLE__
     uint64_t tid;
     pthread_threadid_np(pthread_self(), &tid);
     return tid;
 #else
     return static_cast<int64_t>(pthread_self());
 #endif
 }

 int64_t Thread::current_thread_id() {
 #ifdef __APPLE__
     uint64_t tid;
     pthread_threadid_np(nullptr, &tid);
     return tid;
 #else
     return syscall(SYS_gettid);
 #endif
 }

 int64_t Thread::wait_for_tid() const {
     int loop_count = 0;
     while (true) {
         int64_t t = Acquire_Load(&_tid);
         if (t != PARENT_WAITING_TID) {
             return t;
         }
         // copied from boost::detail::yield
         int k = loop_count++;
         if (k < 32 || k & 1) {
             sched_yield();
         } else {
             // g++ -Wextra warns on {} or {0}
             struct timespec rqtp = {0, 0};

             // POSIX says that timespec has tv_sec and tv_nsec
             // But it doesn't guarantee order or placement

             rqtp.tv_sec = 0;
             rqtp.tv_nsec = 1000;

             nanosleep(&rqtp, 0);
         }
     }
 }

 Status Thread::start_thread(const std::string& category, const std::string& name,
                             const ThreadFunctor& functor, uint64_t flags,
                             scoped_refptr<Thread>* holder) {
     std::call_once(once, init_threadmgr);

     // Temporary reference for the duration of this function.
     scoped_refptr<Thread> t(new Thread(category, name, functor));

     // Optional, and only set if the thread was successfully created.
     //
     // We have to set this before we even start the thread because it's
     // allowed for the thread functor to access 'holder'.
     if (holder) {
         *holder = t;
     }

     t->_tid = PARENT_WAITING_TID;

     // Add a reference count to the thread since SuperviseThread() needs to
     // access the thread object, and we have no guarantee that our caller
     // won't drop the reference as soon as we return. This is dereferenced
     // in FinishThread().
     t->AddRef();

     auto cleanup = MakeScopedCleanup([&]() {
         // If we failed to create the thread, we need to undo all of our prep work.
         t->_tid = INVALID_TID;
         t->Release();
     });

     int ret = pthread_create(&t->_thread, nullptr, &Thread::supervise_thread, t.get());
     if (ret) {
         return Status::RuntimeError("Could not create thread. (error {}) {}", ret, strerror(ret));
     }

     // The thread has been created and is now joinable.
     //
     // Why set this in the parent and not the child? Because only the parent
     // (or someone communicating with the parent) can join, so joinable must
     // be set before the parent returns.
     t->_joinable = true;
     cleanup.cancel();

     VLOG_NOTICE << "Started thread " << t->tid() << " - " << category << ":" << name;
     return Status::OK();
 }

 void* Thread::supervise_thread(void* arg) {
     Thread* t = static_cast<Thread*>(arg);
     int64_t system_tid = Thread::current_thread_id();
     PCHECK(system_tid != -1);

     // Take an additional reference to the thread manager, which we'll need below.
     ANNOTATE_IGNORE_SYNC_BEGIN();
     std::shared_ptr<ThreadMgr> thread_mgr_ref = thread_manager;
     ANNOTATE_IGNORE_SYNC_END();

     // Set up the TLS.
     //
     // We could store a scoped_refptr in the TLS itself, but as its
     // lifecycle is poorly defined, we'll use a bare pointer. We
     // already incremented the reference count in StartThread.
     Thread::_tls = t;

     // Create thread context, there is no need to create it when func is executed.
     ThreadLocalHandle::create_thread_local_if_not_exits();

     // Publish our tid to '_tid', which unblocks any callers waiting in
     // WaitForTid().
     Release_Store(&t->_tid, system_tid);

     std::string name = strings::Substitute("$0-$1", t->name(), system_tid);
     thread_manager->set_thread_name(name, t->_tid);
     thread_manager->add_thread(pthread_self(), name, t->category(), t->_tid);

     // FinishThread() is guaranteed to run (even if functor_ throws an
     // exception) because pthread_cleanup_push() creates a scoped object
     // whose destructor invokes the provided callback.
     pthread_cleanup_push(&Thread::finish_thread, t);
     t->_functor();
     pthread_cleanup_pop(true);

     return nullptr;
 }

 void Thread::finish_thread(void* arg) {
     Thread* t = static_cast<Thread*>(arg);

     // We're here either because of the explicit pthread_cleanup_pop() in
     // SuperviseThread() or through pthread_exit(). In either case,
     // thread_manager is guaranteed to be live because thread_mgr_ref in
     // SuperviseThread() is still live.
     thread_manager->remove_thread(pthread_self(), t->category());

     // Signal any Joiner that we're done.
     t->_done.count_down();

     VLOG_CRITICAL << "Ended thread " << t->_tid << " - " << t->category() << ":" << t->name();
     t->Release();
     // NOTE: the above 'Release' call could be the last reference to 'this',
     // so 'this' could be destructed at this point. Do not add any code
     // following here!

     ThreadLocalHandle::del_thread_local_if_count_is_zero();
 }

 void Thread::init_threadmgr() {
     thread_manager.reset(new ThreadMgr());
 }

 ThreadJoiner::ThreadJoiner(Thread* thr)
         : _thread(CHECK_NOTNULL(thr)),
           _warn_after_ms(kDefaultWarnAfterMs),
           _warn_every_ms(kDefaultWarnEveryMs),
           _give_up_after_ms(kDefaultGiveUpAfterMs) {}

 ThreadJoiner& ThreadJoiner::warn_after_ms(int ms) {
     _warn_after_ms = ms;
     return *this;
 }

 ThreadJoiner& ThreadJoiner::warn_every_ms(int ms) {
     _warn_every_ms = ms;
     return *this;
 }

 ThreadJoiner& ThreadJoiner::give_up_after_ms(int ms) {
     _give_up_after_ms = ms;
     return *this;
 }

 Status ThreadJoiner::join() {
     if (Thread::current_thread() && Thread::current_thread()->tid() == _thread->tid()) {
         return Status::InvalidArgument("Can't join on own thread. (error {}) {}", -1,
                                        _thread->_name);
     }

     // Early exit: double join is a no-op.
     if (!_thread->_joinable) {
         return Status::OK();
     }

     int waited_ms = 0;
     bool keep_trying = true;
     while (keep_trying) {
         if (waited_ms >= _warn_after_ms) {
             LOG(WARNING) << strings::Substitute("Waited for $0ms trying to join with $1 (tid $2)",
                                                 waited_ms, _thread->_name, _thread->_tid);
         }

         int remaining_before_giveup = std::numeric_limits<int>::max();
         if (_give_up_after_ms != -1) {
             remaining_before_giveup = _give_up_after_ms - waited_ms;
         }

         int remaining_before_next_warn = _warn_every_ms;
         if (waited_ms < _warn_after_ms) {
             remaining_before_next_warn = _warn_after_ms - waited_ms;
         }

         if (remaining_before_giveup < remaining_before_next_warn) {
             keep_trying = false;
         }

         int wait_for = std::min(remaining_before_giveup, remaining_before_next_warn);

         if (_thread->_done.wait_for(std::chrono::milliseconds(wait_for))) {
             // Unconditionally join before returning, to guarantee that any TLS
             // has been destroyed (pthread_key_create() destructors only run
             // after a pthread's user method has returned).
             int ret = pthread_join(_thread->_thread, nullptr);
             CHECK_EQ(ret, 0);
             _thread->_joinable = false;
             return Status::OK();
         }
         waited_ms += wait_for;
     }
     return Status::Aborted("Timed out after {}ms joining on {}", waited_ms, _thread->_name);
 }

 void register_thread_display_page(WebPageHandler* web_page_handler) {
     web_page_handler->register_template_page(
             "/threadz", "Threads",
             std::bind(&ThreadMgr::display_thread_callback, thread_manager.get(),
                       std::placeholders::_1, std::placeholders::_2),
             true);
 }
 } // namespace doris
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	// This file is copied from
	// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/thread.cc
	// and modified by Doris

	#include "thread.h"

	#include <sys/resource.h>

	#ifndef __APPLE__
	// IWYU pragma: no_include <bits/types/struct_sched_param.h>
	#include <sched.h>
	#include <sys/prctl.h>
	#else
	#include <pthread.h>

	#include <cstdint>
	#endif

	// IWYU pragma: no_include <bthread/errno.h>
	#include <errno.h> // IWYU pragma: keep
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	#include <algorithm>
	// IWYU pragma: no_include <bits/chrono.h>
	#include <chrono> // IWYU pragma: keep
	#include <cstring>
	#include <functional>
	#include <limits>
	#include <map>
	#include <memory>
	#include <mutex>
	#include <ostream>
	#include <string>
	#include <vector>

	#include "common/config.h"
	#include "common/logging.h"
	#include "gutil/atomicops.h"
	#include "gutil/dynamic_annotations.h"
	#include "gutil/map-util.h"
	#include "gutil/stringprintf.h"
	#include "gutil/strings/substitute.h"
	#include "http/web_page_handler.h"
	#include "runtime/thread_context.h"
	#include "util/debug/sanitizer_scopes.h"
	#include "util/easy_json.h"
	#include "util/os_util.h"
	#include "util/scoped_cleanup.h"
	#include "util/url_coding.h"

	namespace doris {

	class ThreadMgr;

	__thread Thread* Thread::_tls = nullptr;

	// Singleton instance of ThreadMgr. Only visible in this file, used only by Thread.
	// // The Thread class adds a reference to thread_manager while it is supervising a thread so
	// // that a race between the end of the process's main thread (and therefore the destruction
	// // of thread_manager) and the end of a thread that tries to remove itself from the
	// // manager after the destruction can be avoided.
	static std::shared_ptr<ThreadMgr> thread_manager;
	//
	// Controls the single (lazy) initialization of thread_manager.
	static std::once_flag once;

	// A singleton class that tracks all live threads, and groups them together for easy
	// auditing. Used only by Thread.
	class ThreadMgr {
	public:
	ThreadMgr() : _threads_started_metric(0), _threads_running_metric(0) {}

	~ThreadMgr() {
	std::unique_lock<std::mutex> lock(_lock);
	_thread_categories.clear();
	}

	static void set_thread_name(const std::string& name, int64_t tid);

	#ifndef __APPLE__
	static void set_idle_sched(int64_t tid);

	static void set_thread_nice_value(int64_t tid);
	#endif

	// not the system TID, since pthread_t is less prone to being recycled.
	void add_thread(const pthread_t& pthread_id, const std::string& name,
	const std::string& category, int64_t tid);

	// Removes a thread from the supplied category. If the thread has
	// already been removed, this is a no-op.
	void remove_thread(const pthread_t& pthread_id, const std::string& category);

	void display_thread_callback(const WebPageHandler::ArgumentMap& args, EasyJson* ej) const;

	private:
	// Container class for any details we want to capture about a thread
	// TODO: Add start-time.
	// TODO: Track fragment ID.
	class ThreadDescriptor {
	public:
	ThreadDescriptor() {}
	ThreadDescriptor(std::string category, std::string name, int64_t thread_id)
	: _name(std::move(name)), _category(std::move(category)), _thread_id(thread_id) {}

	const std::string& name() const { return _name; }
	const std::string& category() const { return _category; }
	int64_t thread_id() const { return _thread_id; }

	private:
	std::string _name;
	std::string _category;
	int64_t _thread_id;
	};

	void summarize_thread_descriptor(const ThreadDescriptor& desc, EasyJson* ej) const;

	// A ThreadCategory is a set of threads that are logically related.
	// TODO: unordered_map is incompatible with pthread_t, but would be more
	// efficient here.
	typedef std::map<const pthread_t, ThreadDescriptor> ThreadCategory;

	// All thread categories, keyed on the category name.
	typedef std::map<std::string, ThreadCategory> ThreadCategoryMap;

	// Protects _thread_categories and thread metrics.
	mutable std::mutex _lock;

	// All thread categories that ever contained a thread, even if empty
	ThreadCategoryMap _thread_categories;

	// Counters to track all-time total number of threads, and the
	// current number of running threads.
	uint64_t _threads_started_metric;
	uint64_t _threads_running_metric;

	DISALLOW_COPY_AND_ASSIGN(ThreadMgr);
	};

	void ThreadMgr::set_thread_name(const std::string& name, int64_t tid) {
	if (tid == getpid()) {
	return;
	}
	#ifdef __APPLE__
	int err = pthread_setname_np(name.c_str());
	#else
	int err = prctl(PR_SET_NAME, name.c_str());
	#endif
	if (err < 0 && errno != EPERM) {
	LOG(ERROR) << "set_thread_name";
	}
	}

	#ifndef __APPLE__
	void ThreadMgr::set_idle_sched(int64_t tid) {
	if (tid == getpid()) {
	return;
	}
	struct sched_param sp = {.sched_priority = 0};
	int err = sched_setscheduler(0, SCHED_IDLE, &sp);
	if (err < 0 && errno != EPERM) {
	LOG(ERROR) << "set_thread_idle_sched";
	}
	}

	void ThreadMgr::set_thread_nice_value(int64_t tid) {
	if (tid == getpid()) {
	return;
	}
	// From Linux kernel:
	// In the current implementation, each unit of difference in the nice values of two
	// processes results in a factor of 1.25 in the degree to which the
	// scheduler favors the higher priority process. This causes very
	// low nice values (+19) to truly provide little CPU to a process
	// whenever there is any other higher priority load on the system,
	// and makes high nice values (-20) deliver most of the CPU to
	// applications that require it (e.g., some audio applications).

	// Choose 5 as lower priority value, default is 0
	int err = setpriority(PRIO_PROCESS, 0, config::scan_thread_nice_value);
	if (err < 0 && errno != EPERM) {
	LOG(ERROR) << "set_thread_low_priority";
	}
	}
	#endif

	void ThreadMgr::add_thread(const pthread_t& pthread_id, const std::string& name,
	const std::string& category, int64_t tid) {
	// These annotations cause TSAN to ignore the synchronization on lock_
	// without causing the subsequent mutations to be treated as data races
	// in and of themselves (that's what IGNORE_READS_AND_WRITES does).
	//
	// Why do we need them here and in SuperviseThread()? TSAN operates by
	// observing synchronization events and using them to establish "happens
	// before" relationships between threads. Where these relationships are
	// not built, shared state access constitutes a data race. The
	// synchronization events here, in RemoveThread(), and in
	// SuperviseThread() may cause TSAN to establish a "happens before"
	// relationship between thread functors, ignoring potential data races.
	// The annotations prevent this from happening.
	ANNOTATE_IGNORE_SYNC_BEGIN();
	debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
	{
	std::unique_lock<std::mutex> l(_lock);
	_thread_categories[category][pthread_id] = ThreadDescriptor(category, name, tid);
	_threads_running_metric++;
	_threads_started_metric++;
	}
	ANNOTATE_IGNORE_SYNC_END();
	}

	void ThreadMgr::remove_thread(const pthread_t& pthread_id, const std::string& category) {
	ANNOTATE_IGNORE_SYNC_BEGIN();
	debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
	{
	std::unique_lock<std::mutex> l(_lock);
	auto category_it = _thread_categories.find(category);
	DCHECK(category_it != _thread_categories.end());
	category_it->second.erase(pthread_id);
	_threads_running_metric--;
	}
	ANNOTATE_IGNORE_SYNC_END();
	}

	void ThreadMgr::display_thread_callback(const WebPageHandler::ArgumentMap& args,
	EasyJson* ej) const {
	const auto* category_name = FindOrNull(args, "group");
	if (category_name) {
	bool requested_all = (*category_name == "all");
	ej->Set("requested_thread_group", EasyJson::kObject);
	(ej)["group_name"] = escape_for_html_to_string(category_name);
	(*ej)["requested_all"] = requested_all;

	// The critical section is as short as possible so as to minimize the delay
	// imposed on new threads that acquire the lock in write mode.
	std::vector<ThreadDescriptor> descriptors_to_print;
	if (!requested_all) {
	std::unique_lock<std::mutex> l(_lock);
	const auto* category = FindOrNull(_thread_categories, *category_name);
	if (!category) {
	return;
	}
	for (const auto& elem : *category) {
	descriptors_to_print.emplace_back(elem.second);
	}
	} else {
	std::unique_lock<std::mutex> l(_lock);
	for (const auto& category : _thread_categories) {
	for (const auto& elem : category.second) {
	descriptors_to_print.emplace_back(elem.second);
	}
	}
	}

	EasyJson found = (*ej).Set("found", EasyJson::kObject);
	EasyJson threads = found.Set("threads", EasyJson::kArray);
	for (const auto& desc : descriptors_to_print) {
	summarize_thread_descriptor(desc, &threads);
	}
	} else {
	// List all thread groups and the number of threads running in each.
	std::vector<std::pair<string, uint64_t>> thread_categories_info;
	uint64_t running;
	{
	std::unique_lock<std::mutex> l(_lock);
	running = _threads_running_metric;
	thread_categories_info.reserve(_thread_categories.size());
	for (const auto& category : _thread_categories) {
	thread_categories_info.emplace_back(category.first, category.second.size());
	}

	(*ej)["total_threads_running"] = running;
	EasyJson groups = ej->Set("groups", EasyJson::kArray);
	for (const auto& elem : thread_categories_info) {
	string category_arg;
	url_encode(elem.first, &category_arg);
	EasyJson group = groups.PushBack(EasyJson::kObject);
	group["encoded_group_name"] = category_arg;
	group["group_name"] = elem.first;
	group["threads_running"] = elem.second;
	}
	}
	}
	}

	void ThreadMgr::summarize_thread_descriptor(const ThreadMgr::ThreadDescriptor& desc,
	EasyJson* ej) const {
	ThreadStats stats;
	Status status = get_thread_stats(desc.thread_id(), &stats);
	if (!status.ok()) {
	LOG(WARNING) << "Could not get per-thread statistics: " << status.to_string();
	}

	EasyJson thread = ej->PushBack(EasyJson::kObject);
	thread["thread_name"] = desc.name();
	thread["user_sec"] = static_cast<double>(stats.user_ns) / 1e9;
	thread["kernel_sec"] = static_cast<double>(stats.kernel_ns) / 1e9;
	thread["iowait_sec"] = static_cast<double>(stats.iowait_ns) / 1e9;
	}

	Thread::~Thread() {
	if (_joinable) {
	int ret = pthread_detach(_thread);
	CHECK_EQ(ret, 0);
	}
	}

	void Thread::set_self_name(const std::string& name) {
	ThreadMgr::set_thread_name(name, current_thread_id());
	}

	#ifndef __APPLE__
	void Thread::set_idle_sched() {
	ThreadMgr::set_idle_sched(current_thread_id());
	}

	void Thread::set_thread_nice_value() {
	ThreadMgr::set_thread_nice_value(current_thread_id());
	}
	#endif

	void Thread::join() {
	static_cast<void>(ThreadJoiner(this).join());
	}

	int64_t Thread::tid() const {
	int64_t t = base::subtle::Acquire_Load(&_tid);
	if (t != PARENT_WAITING_TID) {
	return _tid;
	}
	return wait_for_tid();
	}

	pthread_t Thread::pthread_id() const {
	return _thread;
	}

	const std::string& Thread::name() const {
	return _name;
	}

	const std::string& Thread::category() const {
	return _category;
	}

	std::string Thread::to_string() const {
	return strings::Substitute("Thread $0 (name: \"$1\", category: \"$2\")", tid(), _name,
	_category);
	}

	Thread* Thread::current_thread() {
	return _tls;
	}

	int64_t Thread::unique_thread_id() {
	#ifdef __APPLE__
	uint64_t tid;
	pthread_threadid_np(pthread_self(), &tid);
	return tid;
	#else
	return static_cast<int64_t>(pthread_self());
	#endif
	}

	int64_t Thread::current_thread_id() {
	#ifdef __APPLE__
	uint64_t tid;
	pthread_threadid_np(nullptr, &tid);
	return tid;
	#else
	return syscall(SYS_gettid);
	#endif
	}

	int64_t Thread::wait_for_tid() const {
	int loop_count = 0;
	while (true) {
	int64_t t = Acquire_Load(&_tid);
	if (t != PARENT_WAITING_TID) {
	return t;
	}
	// copied from boost::detail::yield
	int k = loop_count++;
	if (k < 32 \|\| k & 1) {
	sched_yield();
	} else {
	// g++ -Wextra warns on {} or {0}
	struct timespec rqtp = {0, 0};

	// POSIX says that timespec has tv_sec and tv_nsec
	// But it doesn't guarantee order or placement

	rqtp.tv_sec = 0;
	rqtp.tv_nsec = 1000;

	nanosleep(&rqtp, 0);
	}
	}
	}

	Status Thread::start_thread(const std::string& category, const std::string& name,
	const ThreadFunctor& functor, uint64_t flags,
	scoped_refptr<Thread>* holder) {
	std::call_once(once, init_threadmgr);

	// Temporary reference for the duration of this function.
	scoped_refptr<Thread> t(new Thread(category, name, functor));

	// Optional, and only set if the thread was successfully created.
	//
	// We have to set this before we even start the thread because it's
	// allowed for the thread functor to access 'holder'.
	if (holder) {
	*holder = t;
	}

	t->_tid = PARENT_WAITING_TID;

	// Add a reference count to the thread since SuperviseThread() needs to
	// access the thread object, and we have no guarantee that our caller
	// won't drop the reference as soon as we return. This is dereferenced
	// in FinishThread().
	t->AddRef();

	auto cleanup = MakeScopedCleanup([&]() {
	// If we failed to create the thread, we need to undo all of our prep work.
	t->_tid = INVALID_TID;
	t->Release();
	});

	int ret = pthread_create(&t->_thread, nullptr, &Thread::supervise_thread, t.get());
	if (ret) {
	return Status::RuntimeError("Could not create thread. (error {}) {}", ret, strerror(ret));
	}

	// The thread has been created and is now joinable.
	//
	// Why set this in the parent and not the child? Because only the parent
	// (or someone communicating with the parent) can join, so joinable must
	// be set before the parent returns.
	t->_joinable = true;
	cleanup.cancel();

	VLOG_NOTICE << "Started thread " << t->tid() << " - " << category << ":" << name;
	return Status::OK();
	}

	void* Thread::supervise_thread(void* arg) {
	Thread* t = static_cast<Thread*>(arg);
	int64_t system_tid = Thread::current_thread_id();
	PCHECK(system_tid != -1);

	// Take an additional reference to the thread manager, which we'll need below.
	ANNOTATE_IGNORE_SYNC_BEGIN();
	std::shared_ptr<ThreadMgr> thread_mgr_ref = thread_manager;
	ANNOTATE_IGNORE_SYNC_END();

	// Set up the TLS.
	//
	// We could store a scoped_refptr in the TLS itself, but as its
	// lifecycle is poorly defined, we'll use a bare pointer. We
	// already incremented the reference count in StartThread.
	Thread::_tls = t;

	// Create thread context, there is no need to create it when func is executed.
	ThreadLocalHandle::create_thread_local_if_not_exits();

	// Publish our tid to '_tid', which unblocks any callers waiting in
	// WaitForTid().
	Release_Store(&t->_tid, system_tid);

	std::string name = strings::Substitute("$0-$1", t->name(), system_tid);
	thread_manager->set_thread_name(name, t->_tid);
	thread_manager->add_thread(pthread_self(), name, t->category(), t->_tid);

	// FinishThread() is guaranteed to run (even if functor_ throws an
	// exception) because pthread_cleanup_push() creates a scoped object
	// whose destructor invokes the provided callback.
	pthread_cleanup_push(&Thread::finish_thread, t);
	t->_functor();
	pthread_cleanup_pop(true);

	return nullptr;
	}

	void Thread::finish_thread(void* arg) {
	Thread* t = static_cast<Thread*>(arg);

	// We're here either because of the explicit pthread_cleanup_pop() in
	// SuperviseThread() or through pthread_exit(). In either case,
	// thread_manager is guaranteed to be live because thread_mgr_ref in
	// SuperviseThread() is still live.
	thread_manager->remove_thread(pthread_self(), t->category());

	// Signal any Joiner that we're done.
	t->_done.count_down();

	VLOG_CRITICAL << "Ended thread " << t->_tid << " - " << t->category() << ":" << t->name();
	t->Release();
	// NOTE: the above 'Release' call could be the last reference to 'this',
	// so 'this' could be destructed at this point. Do not add any code
	// following here!

	ThreadLocalHandle::del_thread_local_if_count_is_zero();
	}

	void Thread::init_threadmgr() {
	thread_manager.reset(new ThreadMgr());
	}

	ThreadJoiner::ThreadJoiner(Thread* thr)
	: _thread(CHECK_NOTNULL(thr)),
	_warn_after_ms(kDefaultWarnAfterMs),
	_warn_every_ms(kDefaultWarnEveryMs),
	_give_up_after_ms(kDefaultGiveUpAfterMs) {}

	ThreadJoiner& ThreadJoiner::warn_after_ms(int ms) {
	_warn_after_ms = ms;
	return *this;
	}

	ThreadJoiner& ThreadJoiner::warn_every_ms(int ms) {
	_warn_every_ms = ms;
	return *this;
	}

	ThreadJoiner& ThreadJoiner::give_up_after_ms(int ms) {
	_give_up_after_ms = ms;
	return *this;
	}

	Status ThreadJoiner::join() {
	if (Thread::current_thread() && Thread::current_thread()->tid() == _thread->tid()) {
	return Status::InvalidArgument("Can't join on own thread. (error {}) {}", -1,
	_thread->_name);
	}

	// Early exit: double join is a no-op.
	if (!_thread->_joinable) {
	return Status::OK();
	}

	int waited_ms = 0;
	bool keep_trying = true;
	while (keep_trying) {
	if (waited_ms >= _warn_after_ms) {
	LOG(WARNING) << strings::Substitute("Waited for $0ms trying to join with $1 (tid $2)",
	waited_ms, _thread->_name, _thread->_tid);
	}

	int remaining_before_giveup = std::numeric_limits<int>::max();
	if (_give_up_after_ms != -1) {
	remaining_before_giveup = _give_up_after_ms - waited_ms;
	}

	int remaining_before_next_warn = _warn_every_ms;
	if (waited_ms < _warn_after_ms) {
	remaining_before_next_warn = _warn_after_ms - waited_ms;
	}

	if (remaining_before_giveup < remaining_before_next_warn) {
	keep_trying = false;
	}

	int wait_for = std::min(remaining_before_giveup, remaining_before_next_warn);

	if (_thread->_done.wait_for(std::chrono::milliseconds(wait_for))) {
	// Unconditionally join before returning, to guarantee that any TLS
	// has been destroyed (pthread_key_create() destructors only run
	// after a pthread's user method has returned).
	int ret = pthread_join(_thread->_thread, nullptr);
	CHECK_EQ(ret, 0);
	_thread->_joinable = false;
	return Status::OK();
	}
	waited_ms += wait_for;
	}
	return Status::Aborted("Timed out after {}ms joining on {}", waited_ms, _thread->_name);
	}

	void register_thread_display_page(WebPageHandler* web_page_handler) {
	web_page_handler->register_template_page(
	"/threadz", "Threads",
	std::bind(&ThreadMgr::display_thread_callback, thread_manager.get(),
	std::placeholders::_1, std::placeholders::_2),
	true);
	}
	} // namespace doris