blob: 6e73ed371193d6bdb00dda7a5c7024b809d49b95 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#ifndef MXNET_OPERATOR_OPERATOR_TUNE_H_
#define MXNET_OPERATOR_OPERATOR_TUNE_H_
#include <mshadow/base.h>
#include <mshadow/tensor.h>
#include <vector>
#include <set>
#include <atomic>
#include <string>
// #define MXNET_DEBUG_TUNING_LAUNCH
#ifdef MXNET_DEBUG_TUNING_LAUNCH
#include <cxxabi.h>
/*!
 * \brief Build a printable name for the type T (only compiled for tuning-launch debugging)
 * \tparam T Type whose name is requested
 * \return The demangled name when abi::__cxa_demangle reports success (status 0),
 *         otherwise the raw string returned by typeid(T).name()
 */
template<typename T> inline std::string type_name() {
  const char *mangled = typeid(T).name();
  int status = -4;  // some arbitrary value to eliminate the compiler warning
  // The unique_ptr passes whatever __cxa_demangle returned to std::free on scope exit
  std::unique_ptr<char, void (*)(void *)> demangled(
    abi::__cxa_demangle(mangled, nullptr, nullptr, &status),
    &std::free);
  if (status == 0) {
    return std::string(demangled.get());
  }
  // Demangling failed: fall back to the implementation-defined (mangled) name
  return std::string(mangled);
}
/*
 * Debug helper: prints "<label>: <type name of op>" to std::cout the first time a given
 * op type name is seen by this expansion of the macro (the bookkeeping set is a
 * block-scope static, so every expansion keeps its own set).
 *   __label$  Value streamed in front of the type name
 *   __op$     Type whose name is obtained through type_name<>()
 * The static mutex is held while the set is updated and the line is printed, so
 * concurrent callers neither race on the set nor interleave their output.
 */
#define MXNET_DEBUG_PRINT_UNIQUE_OP(__label$, __op$) \
  { \
    static std::mutex cs; \
    static std::unordered_set<std::string> ops; \
    const std::string name = type_name<__op$>(); \
    std::lock_guard<std::mutex> lock(cs); \
    if (ops.emplace(name).second) { \
      std::cout << (__label$) << ": " << name << std::endl; \
    } \
  }
#else
#define MXNET_DEBUG_PRINT_UNIQUE_OP(__label$, __op$) /* */
#endif
namespace mxnet {
namespace op {
/*! \brief log2 of the timed loop size: OperatorTuneBase::WORKLOAD_COUNT is
 *         (1 << WORKLOAD_COUNT_SHIFT), and OperatorTuneBase::IsOMPFaster() shifts
 *         workload values right by this amount
 */
#define WORKLOAD_COUNT_SHIFT 11
/*!
 * \brief Shared data for all data types being tuned, acts as a base class for the higher-level
 *        templated tuning classes
 */
class OperatorTuneBase {
 public:
  /*! \brief Integer type used for durations (nanoseconds) */
  using duration_t = int64_t;
  /*! \brief Time point type produced by Now() */
  using Tick = std::chrono::high_resolution_clock::time_point;

 protected:
  /*! \brief Have calculated omp_overhead_ yet? */
  static std::atomic<bool> calculated_;
  /*! \brief Time in nanoseconds for OMP overhead */
  static duration_t omp_overhead_ns_;
  /*! \brief Print debug/trace output for tuning info */
  static bool verbose_tuning_info_;
  /*! \brief Tuning scale factor */
  static double tuning_weight_scale_;

 public:
  /*!
   * \brief Get timestamp for "now"
   * \return Tick object representing the current timestamp
   */
  static MSHADOW_CINLINE Tick Now() {
    return std::chrono::high_resolution_clock::now();
  }

  /*!
   * \brief Get the duration between two ticks
   * \param t1 Start time tick
   * \param t2 End time tick
   * \return Nanoseconds elapsed going from t1 to t2
   */
  static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &t1, const Tick &t2) {
    const std::chrono::nanoseconds elapsed =
      std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1);
    return static_cast<duration_t>(elapsed.count());
  }

  /*!
   * \brief Get the duration between a reference tick and now
   * \param since Reference time from which to calculate the duration
   * \return Nanoseconds elapsed going from 'since' to the current time
   */
  static MSHADOW_CINLINE duration_t GetDurationInNanoseconds(const Tick &since) {
    return GetDurationInNanoseconds(since, Now());
  }

  /*! \brief Loop size to be timed (single op nanos may be too small to store accurately) */
  static constexpr duration_t WORKLOAD_COUNT = (1 << WORKLOAD_COUNT_SHIFT);

  /*!
   * \brief Stopwatch helper: records "now" when constructed
   */
  struct Timer {
    /*!
     * \brief Constructor, captures the start time
     */
    MSHADOW_CINLINE Timer()
      : start_(OperatorTuneBase::Now()) {}

    /*!
     * \brief Get the time elapsed since construction
     * \return Nanoseconds elapsed since this object was constructed
     */
    MSHADOW_CINLINE int64_t duration() const {
      return OperatorTuneBase::GetDurationInNanoseconds(start_);
    }

    /*!
     * \brief Reference start time, set in constructor
     */
    const OperatorTuneBase::Tick start_;
  };

  /*!
   * \brief Estimate the time to compute with and without OMP, then return whether OMP is faster
   * \param N - Number of iterations desired (not consulted by the current estimate)
   * \param thread_count - Number of OMP threads available to perform the iterations
   * \param serial_workload - Serial cost of the work; it is shifted right by
   *                          WORKLOAD_COUNT_SHIFT to obtain the serial time in nanoseconds
   * \returns Whether it's faster to use OMP for these iterations
   */
  inline static bool IsOMPFaster(size_t N, size_t thread_count, const uint64_t serial_workload) {
    // Fewer than two threads: never report OMP as faster (also keeps the division below safe)
    if (thread_count < 2) {
      return false;
    }
    // Time required when running everything serially
    const uint64_t serial_ns = serial_workload >> WORKLOAD_COUNT_SHIFT;
    // Time required with OMP: each thread's share of the work plus the OMP overhead
    const uint64_t per_thread_ns = (serial_workload / thread_count) >> WORKLOAD_COUNT_SHIFT;
    const uint64_t omp_ns = omp_overhead_ns_ + per_thread_ns;
    return omp_ns < serial_ns;
  }
};
namespace tune {
/*!
 * \brief Tuning mode for registered kernel operators
 */
enum TuningMode {
  /*! \brief Based upon tuning data, choose whether to use OMP for kernel CPU Launch() loops */
  kAuto = 0,
  /*! \brief Don't use OMP for parallelism (legacy behavior for GPU builds) */
  kNeverOMP = 1,
  /*! \brief Always use OMP for parallelism (legacy behavior for CPU builds) */
  kAlwaysOMP = 2
};
}  // namespace tune
/*!
 * \brief Tuning state kept per data type: each DType instantiation has its own static
 *        tuning mode, layered on top of the shared OperatorTuneBase data
 * \tparam DType Data type this tuning state belongs to
 */
template<typename DType>
class OperatorTuneByType : public OperatorTuneBase {
 public:
  /*!
   * \brief Change the tuning mode for this data type
   * \param tuning_mode The tune::TuningMode value to store
   */
  static MSHADOW_CINLINE void set_tuning_mode(const tune::TuningMode tuning_mode) {
    // Use const_cast to get past the "assigning non-volatile to volatile" warning
    const_cast<tune::TuningMode &>(tuning_mode_) = tuning_mode;
  }

  /*!
   * \brief Read the tuning mode for this data type
   * \return The tune::TuningMode value currently stored
   */
  static MSHADOW_CINLINE tune::TuningMode tuning_mode() {
    return const_cast<tune::TuningMode &>(tuning_mode_);
  }

  /*!
   * \brief Determine whether to use OMP based upon both timing and configuration
   * \param N - Number of iterations desired
   * \param thread_count - Number of OMP threads available to perform the iterations
   * \param serial_workload - Serial workload value forwarded to OperatorTuneBase::IsOMPFaster
   * \returns Whether OMP should be used for these iterations; always true when
   *          MXNET_USE_OPERATOR_TUNING is not defined
   */
  inline static bool UseOMP(size_t N, size_t thread_count, const uint64_t serial_workload) {
#ifdef MXNET_USE_OPERATOR_TUNING
    const tune::TuningMode mode = tuning_mode();
    if (mode == tune::kNeverOMP) {
      return false;
    }
    if (mode == tune::kAuto) {
      // Let the timing estimate decide
      return OperatorTuneBase::IsOMPFaster(N, thread_count, serial_workload);
    }
    // kAlwaysOMP, and any unrecognized value: use OMP whenever there is more than one thread
    return thread_count > 1;
#else
    return true;
#endif
  }

 protected:
  /*! \brief Tuning mode */
  static volatile tune::TuningMode tuning_mode_;
};
namespace mxnet_op {
/*!
 * \brief Kernel operator wrapper used for tuning data
 * \tparam Operation Wrapped operator type; tuned_op derives from it
 * \tparam DType Data type the tuning data applies to
 */
template<typename Operation, typename DType>
struct tuned_op : public Operation {
  /*!
   * \brief Runtime workload calculation values. Generally, nanoseconds to perform
   *        WORKLOAD_COUNT operations (for unary and binary ops), although they can be
   *        anything if the UseOMP() function is written elsewhere for that op (other than
   *        in operator_tune-inl.h)
   * \remarks This variable generally needs to be implemented somewhere. Currently this is
   *          mostly done via macros in operator_tune.cc. If you get undefined reference
   *          errors when linking, then try to use one of the macros in that file to
   *          instantiate the required data/functions
   */
  static std::vector<float> workload_;

  /*!
   * \brief Forward to the parent class (Operation)'s UseOMP
   * \tparam Args Types of the extra arguments
   * \param N Number of iterations
   * \param thread_count Number of threads available
   * \param args Extra arguments, passed through to Operation::UseOMP unchanged
   * \return Whatever Operation::UseOMP returns (true if OMP parallelism is recommended)
   */
  template<typename... Args>
  static MSHADOW_CINLINE bool UseOMP(size_t N, size_t thread_count, Args... args) {
    return Operation::UseOMP(N, thread_count, args...);
  }

  /*!
   * \brief Call a standard UseOMP() implementation (if it exists). Currently, these
   *        are implemented in operator_tune.cc for standard unary, binary,
   *        and argumentless kernels (i.e. mshadow_op::sqrt)
   * \param N Number of iterations
   * \param thread_count Number of threads available
   * \return true if OMP parallelism is recommended
   */
  static bool UseOMP(size_t N, size_t thread_count);
};
/*!
* \brief Calculate workload for a given lambda function
* \tparam Function Lambda type to time for WORKLOAD_COUNT calls
* \param function Lambda to time for WORKLOAD_COUNT calls
* \return median workload for function call (nanoseconds for WORKLOAD_COUNT calls)
*/
template<typename Function>
inline int64_t get_workload(Function function) {
std::multiset<int64_t> durations;
typename OperatorTuneBase::Timer timer;
for (int pass = 0; pass < 3; ++pass) {
for (int i = 0; i < OperatorTuneBase::WORKLOAD_COUNT; ++i) {
function();
}
}
const OperatorTuneBase::duration_t dd = timer.duration();
durations.insert(dd);
return *++durations.begin(); // return median value
}
/*!
 * \brief Empty tag type with no members.
 *        NOTE(review): presumably a marker for kernel ops that take part in tuning --
 *        confirm against the code that uses it
 */
struct tunable {};
} // namespace mxnet_op
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_OPERATOR_TUNE_H_