| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| #ifndef MXNET_OPERATOR_OPERATOR_TUNE_H_ |
| #define MXNET_OPERATOR_OPERATOR_TUNE_H_ |
| |
#include <mshadow/base.h>
#include <mshadow/tensor.h>

#include <atomic>
#include <chrono>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <vector>
| |
| // #define MXNET_DEBUG_TUNING_LAUNCH |
| |
| #ifdef MXNET_DEBUG_TUNING_LAUNCH |
| #include <cxxabi.h> |
/*!
 * \brief Produce a human-readable (demangled) name for type T, for debug output.
 * \tparam T Type whose name to produce
 * \return Demangled type name when __cxa_demangle succeeds, otherwise the raw
 *         mangled name from typeid
 */
template<typename T> inline std::string type_name() {
  const char *name = typeid(T).name();
  int status = 0;  // written by __cxa_demangle; nonzero indicates failure
  // __cxa_demangle returns a malloc'd buffer (or nullptr on failure);
  // unique_ptr with std::free guarantees it is released on every path
  const std::unique_ptr<char, void (*)(void *)> res{
    abi::__cxa_demangle(name, nullptr, nullptr, &status),
    &std::free
  };
  if (status == 0 && res) {
    return res.get();
  }
  // Fall back to the mangled name. (The original wrote `std::move(name)` on a
  // raw const char*, which is a meaningless move — a plain return is identical.)
  return name;
}
| #define MXNET_DEBUG_PRINT_UNIQUE_OP(__label$, __op$) \ |
| { \ |
| static std::mutex cs; \ |
| static std::unordered_set<std::string> ops; \ |
| const std::string name = type_name<__op$>(); \ |
| if (ops.emplace(name).second) { \ |
| std::cout << (__label$) << ": " << name << std::endl << std::flush; \ |
| } \ |
| } |
| #else |
| #define MXNET_DEBUG_PRINT_UNIQUE_OP(__label$, __op$) /* */ |
| #endif |
| |
| namespace mxnet { |
| namespace op { |
| |
| #define WORKLOAD_COUNT_SHIFT 11 |
| |
/*!
 * \brief Shared data for all data types being tuned; acts as a base class for the
 *        higher-level templated tuning classes
 */
| class OperatorTuneBase { |
| public: |
| typedef int64_t duration_t; |
| |
| protected: |
| /*! \brief Have calculated omp_overhead_ yet? */ |
| static std::atomic<bool> calculated_; |
| /*! \brief Time in nanoseconds for OMP overhead */ |
| static duration_t omp_overhead_ns_; |
| /*! \brief Print debug/trace output for tuning info */ |
| static bool verbose_tuning_info_; |
| /*! \brief Tuning scale factor */ |
| static double tuning_weight_scale_; |
| |
| public: |
| typedef std::chrono::high_resolution_clock::time_point Tick; |
| |
| /*! |
| * \brief Get timestamp for "now" |
| * \return Tick object representing the current itmestamp |
| */ |
| static MSHADOW_CINLINE Tick Now() { |
| return std::chrono::high_resolution_clock::now(); |
| } |
| |
| /*! |
| * \brief Get duration in nanoseconds |
| * \param t1 Start time tick |
| * \param t2 End time tick |
| * \return duration in nanoseconds between t1 and t2 |
| */ |
| static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &t1, const Tick &t2) { |
| return static_cast<duration_t>( |
| std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count()); |
| } |
| |
| /*! |
| * \brief Get duration in nanoseconds between the given 'since' value and now |
| * \param since Reference time which to calculate the duration |
| * \return Duration in nanoseconds between the given 'since' value and now |
| */ |
| static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &since) { |
| return GetDurationInNanoseconds(since, Now()); |
| } |
| |
| /*! \brief Loop size to be timed (single op nanos may be too small to store accurately) */ |
| static constexpr duration_t WORKLOAD_COUNT = (1 << WORKLOAD_COUNT_SHIFT); |
| |
| /*! |
| * \brief Timer convenience class, sets start time as "now" in the constructor |
| */ |
| struct Timer { |
| /*! |
| * \brief Constructor, sets start time |
| */ |
| MSHADOW_CINLINE Timer() |
| : start_(OperatorTuneBase::Now()) {} |
| /*! |
| * \brief Get duration in nanoseconds since construction |
| * \return Duration in nanoseconds since construction |
| */ |
| MSHADOW_CINLINE int64_t duration() const { |
| return OperatorTuneBase::GetDurationInNanoseconds(start_); |
| } |
| |
| /*! |
| * \brief Reference start time, set in constructor |
| */ |
| const OperatorTuneBase::Tick start_; |
| }; |
| |
| /*! |
| * \brief Estimate the time to compute with and without OMP, then return whether OMP is faster |
| * \param N - Number of iterations desired |
| * \param thread_count - Number of OMP threads available to perform the iterations |
| * \returns Whether it's faster to use OMP for these iterations |
| */ |
| inline static bool IsOMPFaster(size_t N, size_t thread_count, const uint64_t serial_workload) { |
| if (thread_count >= 2) { |
| // Compute serial time required |
| const uint64_t total_serial_time_ns = serial_workload >> WORKLOAD_COUNT_SHIFT; |
| |
| // Compute time required for OMP + # items per thread |
| const uint64_t omp_compute_time_ns = (serial_workload / thread_count) >> WORKLOAD_COUNT_SHIFT; |
| const uint64_t total_omp_time_ns = omp_overhead_ns_ + omp_compute_time_ns; |
| |
| const bool rc = total_omp_time_ns < total_serial_time_ns; |
| return rc; |
| } |
| return false; |
| } |
| }; |
| |
namespace tune {
/*!
 * \brief Tuning mode for registered kernel operators
 */
enum TuningMode {
  kAuto,       // Based upon tuning data, choose whether to use OMP for kernel CPU Launch() loops
  kNeverOMP,   // Don't use OMP for parallelism (legacy behavior for GPU builds)
  kAlwaysOMP   // Always use OMP for parallelism (legacy behavior for CPU builds)
};
}  // namespace tune
| |
/*!
 * \brief Per-data-type tuning interface: holds the tuning mode and decides
 *        whether a kernel operating on DType should use OMP.
 * \tparam DType Data type this tuner applies to (each DType instantiation
 *         carries its own static tuning_mode_)
 */
template<typename DType>
class OperatorTuneByType : public OperatorTuneBase {
 public:
  /*!
   * \brief Set tuning mode
   * \param tuning_mode The tune::TuningMode tuning mode value to set
   */
  static MSHADOW_CINLINE void set_tuning_mode(const tune::TuningMode tuning_mode) {
    // Use const_cast to get past "assigning non-volatile to volatile" warning
    // NOTE(review): writing through a non-volatile lvalue obtained by casting
    // away volatility is technically undefined behavior; a plain volatile
    // store (or std::atomic) would be safer — confirm which compiler warned
    // before changing.
    const_cast<tune::TuningMode &>(tuning_mode_) = tuning_mode;
  }

  /*!
   * \brief Get the current tuning mode
   * \return tune::TuningMode value for the current tuning mode
   */
  static MSHADOW_CINLINE tune::TuningMode tuning_mode() {
    // Same const_cast trick as the setter; reads the volatile static through
    // a non-volatile reference.
    return const_cast<tune::TuningMode &>(tuning_mode_);
  }

  /*!
   * \brief Determine whether to use OMP based upon both timing and configuration
   * \param N - Number of iterations desired
   * \param thread_count - Number of OMP threads available to perform the iterations
   * \param serial_workload - Estimated serial cost (scaled; see OperatorTuneBase)
   * \returns Whether it's faster to use OMP for these iterations
   */
  inline static bool UseOMP(size_t N, size_t thread_count, const uint64_t serial_workload) {
#ifdef MXNET_USE_OPERATOR_TUNING
    // NOTE(review): #ifdef is true even if the build defines
    // MXNET_USE_OPERATOR_TUNING=0; if the build system passes a 0/1 value,
    // this should be `#if MXNET_USE_OPERATOR_TUNING` — confirm against the
    // build flags.
    switch (tuning_mode()) {
      case tune::kAuto:
        // Consult measured timing data
        return OperatorTuneBase::IsOMPFaster(N, thread_count, serial_workload);
      case tune::kNeverOMP:
        return false;
      case tune::kAlwaysOMP:
      default:
        // Always parallelize when more than one thread is available
        return thread_count > 1;
    }
#else
    // Tuning disabled at compile time: legacy behavior, always use OMP
    return true;
#endif
  }

 protected:
  /*! \brief Tuning mode (volatile: may be changed by another thread at runtime) */
  static volatile tune::TuningMode tuning_mode_;
};
| |
| namespace mxnet_op { |
/*!
 * \brief Kernel operator wrapper used for tuning data. Wraps an existing
 *        Operation, adding per-type workload data and UseOMP() dispatch.
 * \tparam Operation The wrapped kernel operator
 * \tparam DType Data type the workload data applies to
 */
template<typename Operation, typename DType>
struct tuned_op : public Operation {
  /*! \brief Runtime workload calculation values. Generally, nanoseconds to perform WORKLOAD_COUNT
   *         operations (for unary and binary ops), although they can be anything if the UseOMP()
   *         function is written elsewhere for that op (other than in operator_tune-inl.h)
   *  \remarks This variable generally needs to be implemented somewhere. Currently this is mostly
   *           done via macros in operator_tune.cc. If you get undefined reference errors when
   *           linking, then try to use one of the macros in that file to instantiate the required
   *           data/functions
   */
  static std::vector<float> workload_;

  /*!
   * \brief Calls parent class (Operation)'s UseOMP
   * \tparam Args Variable arguments passed
   * \param N Number of iterations
   * \param thread_count Number of threads available
   * \param args Variable arguments passed through to Operation::UseOMP
   * \return true if OMP parallelism is recommended
   * \note Only selected by overload resolution when extra arguments are
   *       supplied; the no-extra-args case resolves to the declaration below.
   */
  template<typename ...Args>
  static MSHADOW_CINLINE bool UseOMP(size_t N, size_t thread_count, Args... args) {
    return Operation::UseOMP(N, thread_count, args...);
  }

  /*!
   * \brief Call a standard UseOMP() implementation (if it exists). Currently, these
   *        are implemented in operator_tune.cc for standard unary, binary,
   *        and argumentless kernels (i.e. mshadow_op::sqrt)
   * \param N Number of iterations
   * \param thread_count Number of threads available
   * \return true if OMP parallelism is recommended
   * \note Declaration only — the definition is expected to be instantiated via
   *       the macros in operator_tune.cc (see \ref workload_ remarks).
   */
  static bool UseOMP(size_t N, size_t thread_count);
};
| |
| /*! |
| * \brief Calculate workload for a given lambda function |
| * \tparam Function Lambda type to time for WORKLOAD_COUNT calls |
| * \param function Lambda to time for WORKLOAD_COUNT calls |
| * \return median workload for function call (nanoseconds for WORKLOAD_COUNT calls) |
| */ |
| template<typename Function> |
| inline int64_t get_workload(Function function) { |
| std::multiset<int64_t> durations; |
| typename OperatorTuneBase::Timer timer; |
| for (int pass = 0; pass < 3; ++pass) { |
| for (int i = 0; i < OperatorTuneBase::WORKLOAD_COUNT; ++i) { |
| function(); |
| } |
| } |
| const OperatorTuneBase::duration_t dd = timer.duration(); |
| durations.insert(dd); |
| return *++durations.begin(); // return median value |
| } |
| |
| struct tunable {}; |
| |
| } // namespace mxnet_op |
| } // namespace op |
| } // namespace mxnet |
| |
| #endif // MXNET_OPERATOR_OPERATOR_TUNE_H_ |