src/runtime/threading_backend.cc - tvm - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file threading_backend.cc
  * \brief Native threading backend
  */
 #include <dmlc/logging.h>
 #include <tvm/runtime/threading_backend.h>

 #include <algorithm>
 #include <thread>
 #if defined(__linux__) || defined(__ANDROID__)
 #include <fstream>
 #include <sstream>
 #else
 #endif
 #if defined(__linux__)
 #include <sched.h>
 #endif
 #if defined(__hexagon__)
 #include <dlfcn.h>
 #endif

 namespace tvm {
 namespace runtime {
 namespace threading {

 /*!
  * \brief Implementation behind ThreadGroup. Owns the worker threads and, on
  *  Linux/Android, pins them to cores taken from a list sorted by maximum
  *  CPU frequency (fastest first).
  */
 class ThreadGroup::Impl {
  public:
   /*!
    * \brief Launch the worker threads and build the sorted core list.
    * \param num_workers Total number of workers; must be at least 1.
    * \param worker_callback Called as worker_callback(i) on the thread of worker i.
    * \param exclude_worker0 If true, no thread is launched for worker 0, so
    *        num_workers - 1 threads are created.
    */
   Impl(int num_workers, std::function<void(int)> worker_callback, bool exclude_worker0)
       : num_workers_(num_workers) {
     CHECK_GE(num_workers, 1) << "Requested a non-positive number of worker threads.";
     // bool converts to 0/1, so the first launched worker id is 1 when worker 0 is excluded.
     for (int i = exclude_worker0; i < num_workers_; ++i) {
       threads_.emplace_back([worker_callback, i] { worker_callback(i); });
     }
     InitSortedOrder();
   }
   /*! \brief Join all worker threads before the object goes away. */
   ~Impl() { Join(); }

   /*! \brief Block until every still-joinable worker thread has finished. */
   void Join() {
     for (auto& t : threads_) {
       if (t.joinable()) t.join();
     }
   }

   /*!
    * \brief Choose how many workers to use and, unless disabled through the
    *  TVM_BIND_THREADS environment variable, set the CPU affinity of the threads.
    * \param mode kLittle / kBig select the little / big core count; any other
    *        value selects MaxConcurrency().
    * \param nthreads If non-zero, overrides the count derived from mode.
    * \param exclude_worker0 Whether worker 0 runs on the calling (master) thread.
    * \return The number of workers to use, never more than num_workers_.
    */
   int Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
     int num_workers_used = 0;
     if (mode == kLittle) {
       num_workers_used = little_count_;
     } else if (mode == kBig) {
       num_workers_used = big_count_;
     } else {
       // use default
       num_workers_used = threading::MaxConcurrency();
     }
     // if a specific number was given, use that
     if (nthreads) {
       num_workers_used = nthreads;
     }
     // if MaxConcurrency restricted the number of workers (e.g., due to
     // hyperthreading), respect the restriction. On CPUs with N logical cores
     // and N/2 physical cores this will set affinity to the first N/2 logical
     // ones.
     num_workers_used = std::min(num_workers_, num_workers_used);

     // Affinity is applied when TVM_BIND_THREADS is unset or equal to 1.
     const char* val = getenv("TVM_BIND_THREADS");
     if (val == nullptr || atoi(val) == 1) {
       // Do not set affinity if there are more workers than found cores
       if (sorted_order_.size() >= static_cast<unsigned int>(num_workers_)) {
         SetAffinity(exclude_worker0, mode == kLittle);
       } else {
         // The trailing space keeps the two literals from running together.
         LOG(WARNING) << "The thread affinity cannot be set when the number of workers "
                      << "is larger than the number of available cores in the system.";
       }
     }
     return num_workers_used;
   }

  private:
   /*!
    * \brief Bind each worker thread to its own core (Linux/Android only; a
    *  no-op elsewhere).
    *
    *  If worker 0 is offloaded to the master thread, i.e. exclude_worker0 is
    *  true, the master thread is not pinned to a single core: it receives the
    *  multi-core mask built by SetMasterThreadFullCpuAffinity.
    * \param exclude_worker0 Whether worker 0 runs on the master thread.
    * \param reverse If true, cores are taken from the back of sorted_order_
    *        (lowest maximum frequency first).
    */
   void SetAffinity(bool exclude_worker0, bool reverse = false) {
 #if defined(__ANDROID__)
 #ifndef CPU_SET
 #define CPU_SETSIZE 1024
 #define __NCPUBITS (8 * sizeof(uint64_t))
     typedef struct {
       uint64_t __bits[CPU_SETSIZE / __NCPUBITS];
     } cpu_set_t;

 // The shifted constant must be 64 bits wide: shifting a 32-bit `1UL` by 32 or
 // more is undefined where `long` is 32 bits.
 #define CPU_SET(cpu, cpusetp) \
   ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (static_cast<uint64_t>(1) << ((cpu) % __NCPUBITS)))
 #define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
 #endif
 #endif
 #if defined(__linux__) || defined(__ANDROID__)
     // num_workers_ >= 1 is enforced by the constructor, so the cast is safe.
     CHECK_GE(sorted_order_.size(), static_cast<size_t>(num_workers_));

     for (unsigned i = 0; i < threads_.size(); ++i) {
       // threads_[i] serves worker (i + exclude_worker0); give it the matching core slot.
       unsigned core_id;
       if (reverse) {
         core_id = sorted_order_[sorted_order_.size() - (i + exclude_worker0) - 1];
       } else {
         core_id = sorted_order_[i + exclude_worker0];
       }
       cpu_set_t cpuset;
       CPU_ZERO(&cpuset);
       CPU_SET(core_id, &cpuset);
 #if defined(__ANDROID__)
       // NOTE(review): sched_setaffinity takes a thread/process id, while
       // native_handle() is a pthread_t - confirm this is valid on the targeted NDK.
       sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
 #else
       pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
 #endif
     }
     if (exclude_worker0) {  // master thread run task
       // Master thread will have free migration on needed cores.
       // Typically, the OS will schedule the master thread to run at core 0,
       // which is idle, when other workers are running.
       // See the comment inside SetMasterThreadFullCpuAffinity function to get more detail.
       SetMasterThreadFullCpuAffinity(reverse);
     }
 #endif
   }

   /*!
    * \brief Allow the calling (master) thread to run on the whole set of cores
    *  used by the current mode (Linux/Android only; a no-op elsewhere).
    * \param reverse If true, the last little_count_ entries of sorted_order_
    *        are used; otherwise the first min(MaxConcurrency(), big_count_).
    */
   void SetMasterThreadFullCpuAffinity(bool reverse) {
 #if defined(__linux__) || defined(__ANDROID__)
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     // For example, we have 2xA72 + 4xA53 (id is 0 - 5, 4, 5 is A72 big core)
     // And we use config_threadpool API to set we will only use 4xA53.
     // The sorted_order will be [4, 5, 0, 1, 2, 3].
     // When this function is called, the other workers have already been pinned
     // to little cores by SetAffinity. The tvm master thread should also run on
     // little cores, not on the big cores (4, 5).

     // Note: this works well on x86 too. Because x86 doesn't have BIG.LITTLE,
     // our implementation will use kBig mode by default and will let master thread
     // run on intended cores.
     if (reverse) {
       for (int i = 0; i < little_count_; ++i) {
         CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
       }
     } else {
       int num_cpu_workers = std::min(MaxConcurrency(), big_count_);
       for (int i = 0; i < num_cpu_workers; ++i) {
         CPU_SET(sorted_order_[i], &cpuset);
       }
     }
 #if defined(__ANDROID__)
     // NOTE(review): same pthread_t-vs-thread-id question as in SetAffinity.
     sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
 #else
     pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
 #endif
 #endif
   }

   /*!
    * \brief Fill sorted_order_ with core ids ordered by descending maximum
    *  frequency (ties broken by ascending id) and count the cores that share
    *  the highest (big_count_) and lowest (little_count_) frequency.
    *
    *  On Linux/Android the frequency is read from sysfs; a missing file leaves
    *  the frequency at 0 and an unparsable one sets it to -1. On other
    *  platforms every core gets frequency 0.
    */
   void InitSortedOrder() {
     unsigned int threads = std::thread::hardware_concurrency();
 #if defined(__hexagon__)
     // With unsigned PDs, getting the number of available hardware threads
     // is not supported in earlier versions of QuRT. In such cases assume 4.
     if (threads == 0) threads = 4;
 #endif
     std::vector<std::pair<unsigned int, int64_t> > max_freqs;

     for (unsigned int i = 0; i < threads; ++i) {
       int64_t cur_freq = 0;
 #if defined(__linux__) || defined(__ANDROID__)
       std::ostringstream filepath;
       filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/cpuinfo_max_freq";
       std::ifstream ifs(filepath.str());
       if (!ifs.fail()) {
         if (!(ifs >> cur_freq)) {
           cur_freq = -1;
         }
         ifs.close();
       }
 #endif
       max_freqs.push_back(std::make_pair(i, cur_freq));
     }

     // hardware_concurrency() is allowed to return 0. With no entries there is
     // nothing to sort, and dereferencing begin()/rbegin() below would be
     // undefined behaviour, so leave the core list empty and the counts at 0.
     if (max_freqs.empty()) {
       return;
     }

     auto fcmpbyfreq = [](const std::pair<unsigned int, int64_t>& a,
                          const std::pair<unsigned int, int64_t>& b) {
       return a.second == b.second ? a.first < b.first : a.second > b.second;
     };
     std::sort(max_freqs.begin(), max_freqs.end(), fcmpbyfreq);
     int64_t big_freq = max_freqs.begin()->second;
     int64_t little_freq = max_freqs.rbegin()->second;
     for (const auto& core : max_freqs) {
       sorted_order_.push_back(core.first);
       if (big_freq == core.second) {
         big_count_++;
       }
       // On a homogeneous system (big_freq == little_freq) every core counts as big only.
       if (big_freq != little_freq && little_freq == core.second) {
         little_count_++;
       }
     }
     if (big_count_ + little_count_ != static_cast<int>(sorted_order_.size())) {
       LOG(WARNING) << "more than two frequencies detected!";
     }
   }

   // Total number of workers requested at construction.
   int num_workers_;
   // Launched worker threads (worker 0 has no entry when it was excluded).
   std::vector<std::thread> threads_;
   // Core ids ordered by descending maximum frequency.
   std::vector<unsigned int> sorted_order_;
   // Number of cores sharing the highest maximum frequency.
   int big_count_ = 0;
   // Number of cores sharing the lowest maximum frequency (0 if all are equal).
   int little_count_ = 0;
 };

 /*!
  * \brief Create the thread group by constructing its implementation object,
  *  forwarding all three arguments unchanged.
  */
 ThreadGroup::ThreadGroup(int num_workers, std::function<void(int)> worker_callback,
                          bool exclude_worker0)
     : impl_(new Impl(num_workers, worker_callback, exclude_worker0)) {
 }
 /*! \brief Destroy the implementation object (its destructor joins the workers). */
 ThreadGroup::~ThreadGroup() {
   delete impl_;
 }
 /*! \brief Wait for the worker threads by delegating to the implementation. */
 void ThreadGroup::Join() {
   impl_->Join();
 }

 /*!
  * \brief Forward the affinity configuration request to the implementation.
  * \return Whatever the implementation reports as the number of workers to use.
  */
 int ThreadGroup::Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
   const int workers_used = impl_->Configure(mode, nthreads, exclude_worker0);
   return workers_used;
 }

 /*! \brief Yield the calling thread via std::this_thread::yield(). */
 void Yield() {
   std::this_thread::yield();
 }

 /*!
  * \brief Number of worker threads to use by default.
  *
  *  TVM_NUM_THREADS is consulted first, then OMP_NUM_THREADS. If neither is
  *  set, the value comes from std::thread::hardware_concurrency(), halved on
  *  x86-64 builds.
  * \return The selected value, clamped to be at least 1.
  */
 int MaxConcurrency() {
   const char* env_value = getenv("TVM_NUM_THREADS");
   if (env_value == nullptr) {
     env_value = getenv("OMP_NUM_THREADS");
   }

   // An explicit request from the environment wins over hardware detection.
   if (env_value != nullptr) {
     return std::max(atoi(env_value), 1);
   }

   int detected = std::thread::hardware_concurrency();
 #if defined(_M_X64) || defined(__x86_64__)
   detected /= 2;  // ignore hyper-threading
 #elif defined(__hexagon__)
   // With unsigned PDs, getting the number of available hardware threads
   // is not supported in earlier versions of QuRT. In such cases assume 4.
   // If running on simulator, use 1 instead.
   if (detected == 0) {
     const bool in_simulator =
         dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7") != nullptr;
     detected = in_simulator ? 1 : 4;
   }
 #endif
   return std::max(detected, 1);
 }

 }  // namespace threading
 }  // namespace runtime
 }  // namespace tvm
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file threading_backend.cc
	* \brief Native threading backend
	*/
	#include <dmlc/logging.h>
	#include <tvm/runtime/threading_backend.h>

	#include <algorithm>
	#include <thread>
	#if defined(__linux__) || defined(__ANDROID__)
	#include <fstream>
	#include <sstream>
	#else
	#endif
	#if defined(__linux__)
	#include <sched.h>
	#endif
	#if defined(__hexagon__)
	#include <dlfcn.h>
	#endif

	namespace tvm {
	namespace runtime {
	namespace threading {

	/*!
	 * \brief Implementation behind ThreadGroup. Owns the worker threads and, on
	 *  Linux/Android, pins them to cores taken from a list sorted by maximum
	 *  CPU frequency (fastest first).
	 */
	class ThreadGroup::Impl {
	 public:
	  /*!
	   * \brief Launch the worker threads and build the sorted core list.
	   * \param num_workers Total number of workers; must be at least 1.
	   * \param worker_callback Called as worker_callback(i) on the thread of worker i.
	   * \param exclude_worker0 If true, no thread is launched for worker 0, so
	   *        num_workers - 1 threads are created.
	   */
	  Impl(int num_workers, std::function<void(int)> worker_callback, bool exclude_worker0)
	      : num_workers_(num_workers) {
	    CHECK_GE(num_workers, 1) << "Requested a non-positive number of worker threads.";
	    // bool converts to 0/1, so the first launched worker id is 1 when worker 0 is excluded.
	    for (int i = exclude_worker0; i < num_workers_; ++i) {
	      threads_.emplace_back([worker_callback, i] { worker_callback(i); });
	    }
	    InitSortedOrder();
	  }
	  /*! \brief Join all worker threads before the object goes away. */
	  ~Impl() { Join(); }

	  /*! \brief Block until every still-joinable worker thread has finished. */
	  void Join() {
	    for (auto& t : threads_) {
	      if (t.joinable()) t.join();
	    }
	  }

	  /*!
	   * \brief Choose how many workers to use and, unless disabled through the
	   *  TVM_BIND_THREADS environment variable, set the CPU affinity of the threads.
	   * \param mode kLittle / kBig select the little / big core count; any other
	   *        value selects MaxConcurrency().
	   * \param nthreads If non-zero, overrides the count derived from mode.
	   * \param exclude_worker0 Whether worker 0 runs on the calling (master) thread.
	   * \return The number of workers to use, never more than num_workers_.
	   */
	  int Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
	    int num_workers_used = 0;
	    if (mode == kLittle) {
	      num_workers_used = little_count_;
	    } else if (mode == kBig) {
	      num_workers_used = big_count_;
	    } else {
	      // use default
	      num_workers_used = threading::MaxConcurrency();
	    }
	    // if a specific number was given, use that
	    if (nthreads) {
	      num_workers_used = nthreads;
	    }
	    // if MaxConcurrency restricted the number of workers (e.g., due to
	    // hyperthreading), respect the restriction. On CPUs with N logical cores
	    // and N/2 physical cores this will set affinity to the first N/2 logical
	    // ones.
	    num_workers_used = std::min(num_workers_, num_workers_used);

	    // Affinity is applied when TVM_BIND_THREADS is unset or equal to 1.
	    const char* val = getenv("TVM_BIND_THREADS");
	    if (val == nullptr || atoi(val) == 1) {
	      // Do not set affinity if there are more workers than found cores
	      if (sorted_order_.size() >= static_cast<unsigned int>(num_workers_)) {
	        SetAffinity(exclude_worker0, mode == kLittle);
	      } else {
	        // The trailing space keeps the two literals from running together.
	        LOG(WARNING) << "The thread affinity cannot be set when the number of workers "
	                     << "is larger than the number of available cores in the system.";
	      }
	    }
	    return num_workers_used;
	  }

	 private:
	  /*!
	   * \brief Bind each worker thread to its own core (Linux/Android only; a
	   *  no-op elsewhere).
	   *
	   *  If worker 0 is offloaded to the master thread, i.e. exclude_worker0 is
	   *  true, the master thread is not pinned to a single core: it receives the
	   *  multi-core mask built by SetMasterThreadFullCpuAffinity.
	   * \param exclude_worker0 Whether worker 0 runs on the master thread.
	   * \param reverse If true, cores are taken from the back of sorted_order_
	   *        (lowest maximum frequency first).
	   */
	  void SetAffinity(bool exclude_worker0, bool reverse = false) {
	#if defined(__ANDROID__)
	#ifndef CPU_SET
	#define CPU_SETSIZE 1024
	#define __NCPUBITS (8 * sizeof(uint64_t))
	    typedef struct {
	      uint64_t __bits[CPU_SETSIZE / __NCPUBITS];
	    } cpu_set_t;

	// The shifted constant must be 64 bits wide: shifting a 32-bit `1UL` by 32 or
	// more is undefined where `long` is 32 bits.
	#define CPU_SET(cpu, cpusetp) \
	  ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (static_cast<uint64_t>(1) << ((cpu) % __NCPUBITS)))
	#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
	#endif
	#endif
	#if defined(__linux__) || defined(__ANDROID__)
	    // num_workers_ >= 1 is enforced by the constructor, so the cast is safe.
	    CHECK_GE(sorted_order_.size(), static_cast<size_t>(num_workers_));

	    for (unsigned i = 0; i < threads_.size(); ++i) {
	      // threads_[i] serves worker (i + exclude_worker0); give it the matching core slot.
	      unsigned core_id;
	      if (reverse) {
	        core_id = sorted_order_[sorted_order_.size() - (i + exclude_worker0) - 1];
	      } else {
	        core_id = sorted_order_[i + exclude_worker0];
	      }
	      cpu_set_t cpuset;
	      CPU_ZERO(&cpuset);
	      CPU_SET(core_id, &cpuset);
	#if defined(__ANDROID__)
	      // NOTE(review): sched_setaffinity takes a thread/process id, while
	      // native_handle() is a pthread_t - confirm this is valid on the targeted NDK.
	      sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
	#else
	      pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
	#endif
	    }
	    if (exclude_worker0) {  // master thread run task
	      // Master thread will have free migration on needed cores.
	      // Typically, the OS will schedule the master thread to run at core 0,
	      // which is idle, when other workers are running.
	      // See the comment inside SetMasterThreadFullCpuAffinity function to get more detail.
	      SetMasterThreadFullCpuAffinity(reverse);
	    }
	#endif
	  }

	  /*!
	   * \brief Allow the calling (master) thread to run on the whole set of cores
	   *  used by the current mode (Linux/Android only; a no-op elsewhere).
	   * \param reverse If true, the last little_count_ entries of sorted_order_
	   *        are used; otherwise the first min(MaxConcurrency(), big_count_).
	   */
	  void SetMasterThreadFullCpuAffinity(bool reverse) {
	#if defined(__linux__) || defined(__ANDROID__)
	    cpu_set_t cpuset;
	    CPU_ZERO(&cpuset);
	    // For example, we have 2xA72 + 4xA53 (id is 0 - 5, 4, 5 is A72 big core)
	    // And we use config_threadpool API to set we will only use 4xA53.
	    // The sorted_order will be [4, 5, 0, 1, 2, 3].
	    // When this function is called, the other workers have already been pinned
	    // to little cores by SetAffinity. The tvm master thread should also run on
	    // little cores, not on the big cores (4, 5).

	    // Note: this works well on x86 too. Because x86 doesn't have BIG.LITTLE,
	    // our implementation will use kBig mode by default and will let master thread
	    // run on intended cores.
	    if (reverse) {
	      for (int i = 0; i < little_count_; ++i) {
	        CPU_SET(sorted_order_[sorted_order_.size() - i - 1], &cpuset);
	      }
	    } else {
	      int num_cpu_workers = std::min(MaxConcurrency(), big_count_);
	      for (int i = 0; i < num_cpu_workers; ++i) {
	        CPU_SET(sorted_order_[i], &cpuset);
	      }
	    }
	#if defined(__ANDROID__)
	    // NOTE(review): same pthread_t-vs-thread-id question as in SetAffinity.
	    sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
	#else
	    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
	#endif
	#endif
	  }

	  /*!
	   * \brief Fill sorted_order_ with core ids ordered by descending maximum
	   *  frequency (ties broken by ascending id) and count the cores that share
	   *  the highest (big_count_) and lowest (little_count_) frequency.
	   *
	   *  On Linux/Android the frequency is read from sysfs; a missing file leaves
	   *  the frequency at 0 and an unparsable one sets it to -1. On other
	   *  platforms every core gets frequency 0.
	   */
	  void InitSortedOrder() {
	    unsigned int threads = std::thread::hardware_concurrency();
	#if defined(__hexagon__)
	    // With unsigned PDs, getting the number of available hardware threads
	    // is not supported in earlier versions of QuRT. In such cases assume 4.
	    if (threads == 0) threads = 4;
	#endif
	    std::vector<std::pair<unsigned int, int64_t> > max_freqs;

	    for (unsigned int i = 0; i < threads; ++i) {
	      int64_t cur_freq = 0;
	#if defined(__linux__) || defined(__ANDROID__)
	      std::ostringstream filepath;
	      filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/cpuinfo_max_freq";
	      std::ifstream ifs(filepath.str());
	      if (!ifs.fail()) {
	        if (!(ifs >> cur_freq)) {
	          cur_freq = -1;
	        }
	        ifs.close();
	      }
	#endif
	      max_freqs.push_back(std::make_pair(i, cur_freq));
	    }

	    // hardware_concurrency() is allowed to return 0. With no entries there is
	    // nothing to sort, and dereferencing begin()/rbegin() below would be
	    // undefined behaviour, so leave the core list empty and the counts at 0.
	    if (max_freqs.empty()) {
	      return;
	    }

	    auto fcmpbyfreq = [](const std::pair<unsigned int, int64_t>& a,
	                         const std::pair<unsigned int, int64_t>& b) {
	      return a.second == b.second ? a.first < b.first : a.second > b.second;
	    };
	    std::sort(max_freqs.begin(), max_freqs.end(), fcmpbyfreq);
	    int64_t big_freq = max_freqs.begin()->second;
	    int64_t little_freq = max_freqs.rbegin()->second;
	    for (const auto& core : max_freqs) {
	      sorted_order_.push_back(core.first);
	      if (big_freq == core.second) {
	        big_count_++;
	      }
	      // On a homogeneous system (big_freq == little_freq) every core counts as big only.
	      if (big_freq != little_freq && little_freq == core.second) {
	        little_count_++;
	      }
	    }
	    if (big_count_ + little_count_ != static_cast<int>(sorted_order_.size())) {
	      LOG(WARNING) << "more than two frequencies detected!";
	    }
	  }

	  // Total number of workers requested at construction.
	  int num_workers_;
	  // Launched worker threads (worker 0 has no entry when it was excluded).
	  std::vector<std::thread> threads_;
	  // Core ids ordered by descending maximum frequency.
	  std::vector<unsigned int> sorted_order_;
	  // Number of cores sharing the highest maximum frequency.
	  int big_count_ = 0;
	  // Number of cores sharing the lowest maximum frequency (0 if all are equal).
	  int little_count_ = 0;
	};

	/*!
	 * \brief Create the thread group by constructing its implementation object,
	 *  forwarding all three arguments unchanged.
	 */
	ThreadGroup::ThreadGroup(int num_workers, std::function<void(int)> worker_callback,
	                         bool exclude_worker0)
	    : impl_(new Impl(num_workers, worker_callback, exclude_worker0)) {
	}
	/*! \brief Destroy the implementation object (its destructor joins the workers). */
	ThreadGroup::~ThreadGroup() {
	  delete impl_;
	}
	/*! \brief Wait for the worker threads by delegating to the implementation. */
	void ThreadGroup::Join() {
	  impl_->Join();
	}

	/*!
	 * \brief Forward the affinity configuration request to the implementation.
	 * \return Whatever the implementation reports as the number of workers to use.
	 */
	int ThreadGroup::Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
	  const int workers_used = impl_->Configure(mode, nthreads, exclude_worker0);
	  return workers_used;
	}

	/*! \brief Yield the calling thread via std::this_thread::yield(). */
	void Yield() {
	  std::this_thread::yield();
	}

	/*!
	 * \brief Number of worker threads to use by default.
	 *
	 *  TVM_NUM_THREADS is consulted first, then OMP_NUM_THREADS. If neither is
	 *  set, the value comes from std::thread::hardware_concurrency(), halved on
	 *  x86-64 builds.
	 * \return The selected value, clamped to be at least 1.
	 */
	int MaxConcurrency() {
	  int max_concurrency = 1;
	  const char* val = getenv("TVM_NUM_THREADS");
	  if (val == nullptr) {
	    val = getenv("OMP_NUM_THREADS");
	  }
	  if (val != nullptr) {
	    // An explicit request from the environment wins over hardware detection.
	    max_concurrency = atoi(val);
	  } else {
	    max_concurrency = std::thread::hardware_concurrency();
	#if defined(_M_X64) || defined(__x86_64__)
	    max_concurrency /= 2;  // ignore hyper-threading
	#elif defined(__hexagon__)
	    // With unsigned PDs, getting the number of available hardware threads
	    // is not supported in earlier versions of QuRT. In such cases assume 4.
	    // If running on simulator, set max_concurrency to 1.
	    if (max_concurrency == 0) {
	      if (dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7")) {
	        max_concurrency = 1;
	      } else {
	        max_concurrency = 4;
	      }
	    }
	#endif
	  }
	  // Unparsable, zero or negative requests all fall back to a single thread.
	  return std::max(max_concurrency, 1);
	}

	} // namespace threading
	} // namespace runtime
	} // namespace tvm