blob: 1358d0ba8adac84bc1d711809304f70be2b0e936 [file] [log] [blame]
//
// Created by coolivie on 11/25/17.
//
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file c_api_profile.cc
* \brief C API of mxnet profiler and support functions
*/
#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <dmlc/thread_group.h>
#include <mxnet/kvstore.h>
#include <stack>
#include "./c_api_common.h"
#include "../profiler/storage_profiler.h"
#include "../profiler/profiler.h"
namespace mxnet {
static profiler::ProfileDomain api_domain("MXNET_C_API");
static profiler::ProfileCounter api_call_counter("MXNet C API Calls", &api_domain);
static profiler::ProfileCounter api_concurrency_counter("MXNet C API Concurrency", &api_domain);
/*! \brief Per-API-call timing data */
struct APICallTimingData {
const char* name_;
profiler::ProfileTask* task_;
};
/*!
* \brief Per-thread profiling data
*/
class ProfilingThreadData {
public:
/*!
* \brief Constructor, nothrow
*/
inline ProfilingThreadData() = default;
/*!
* \brief Retreive ProfileTask object of the given name, or create if it doesn't exist
* \param name Name of the task
* \param domain Domain of the task
* \return Pointer to the stored or created ProfileTask object
*/
profiler::ProfileTask* profile_task(const char* name, profiler::ProfileDomain* domain) {
// Per-thread so no lock necessary
auto iter = tasks_.find(name);
if (iter == tasks_.end()) {
iter =
tasks_
.emplace(std::make_pair(name, std::make_unique<profiler::ProfileTask>(name, domain)))
.first;
}
return iter->second.get();
}
/*! \brief nestable call stack */
std::stack<APICallTimingData> calls_;
/*! \brief Whether profiling actions should be ignored/excluded */
volatile bool ignore_call_ = false; // same-thread only, so not atomic
private:
/*! \brief tasks */
std::unordered_map<std::string, std::unique_ptr<profiler::ProfileTask>> tasks_;
};
#if DMLC_CXX11_THREAD_LOCAL
static thread_local ProfilingThreadData thread_profiling_data;
#else
static MX_THREAD_LOCAL ProfilingThreadData thread_profiling_data;
#endif
extern void on_enter_api(const char* function) {
if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) {
if (!thread_profiling_data.ignore_call_) {
++api_call_counter;
++api_concurrency_counter;
APICallTimingData data = {function,
thread_profiling_data.profile_task(function, &api_domain)};
thread_profiling_data.calls_.push(data);
data.task_->start();
}
}
}
extern void on_exit_api() {
if (profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kAPI)) {
if (!thread_profiling_data.ignore_call_) {
CHECK(!thread_profiling_data.calls_.empty());
APICallTimingData data = thread_profiling_data.calls_.top();
data.task_->stop();
thread_profiling_data.calls_.pop();
--api_concurrency_counter;
}
}
}
/*!
* \brief Don't profile calls in this scope using RAII
*/
struct IgnoreProfileCallScope {
IgnoreProfileCallScope() {
DCHECK_EQ(thread_profiling_data.ignore_call_, false);
thread_profiling_data.ignore_call_ = true;
}
~IgnoreProfileCallScope() {
DCHECK_EQ(thread_profiling_data.ignore_call_, true);
thread_profiling_data.ignore_call_ = false;
}
};
} // namespace mxnet
/*!
* \brief Simple global profile objects created from Python
* \note These mutexes will almost never have a collision, so internal futexes will be able
* to lock in user mode (good performance)
* I would use dmlc::SpinLock, except that I am concerned that if conditions change and
* there are frequent collisions (ie multithreaded inference), then the spin locks may
* start burning CPU unnoticed
*/
struct PythonProfileObjects {
// These will almost never collide, so locking will happen in user-space (at least on Linux)
// since pthreads uses futexes.
std::mutex cs_domains_;
std::mutex cs_counters_;
std::mutex cs_tasks_;
std::mutex cs_frames_;
std::mutex cs_events_;
std::list<std::shared_ptr<profiler::ProfileDomain>> domains_;
std::unordered_map<profiler::ProfileCounter*, std::shared_ptr<profiler::ProfileCounter>>
counters_;
std::unordered_map<profiler::ProfileDuration*, std::shared_ptr<profiler::ProfileDuration>> tasks_;
std::unordered_map<profiler::ProfileDuration*, std::shared_ptr<profiler::ProfileDuration>>
frames_;
std::unordered_map<profiler::ProfileDuration*, std::shared_ptr<profiler::ProfileDuration>>
events_;
};
static PythonProfileObjects python_profile_objects;
enum class ProfileProcess { kWorker, kServer };
enum class PrintFormat { table, json };
struct ProfileConfigParam : public dmlc::Parameter<ProfileConfigParam> {
bool profile_all;
bool profile_symbolic;
bool profile_imperative;
bool profile_memory;
bool profile_api;
std::string filename;
std::string gpu_memory_profile_filename_prefix;
bool continuous_dump;
float dump_period;
bool aggregate_stats;
int profile_process;
DMLC_DECLARE_PARAMETER(ProfileConfigParam) {
DMLC_DECLARE_FIELD(profile_all).set_default(false).describe("Profile all. Default is False.");
DMLC_DECLARE_FIELD(profile_symbolic)
.set_default(true)
.describe("Profile symbolic operators. Default is True.");
DMLC_DECLARE_FIELD(profile_imperative)
.set_default(true)
.describe("Profile imperative operators. Default is True.");
DMLC_DECLARE_FIELD(profile_memory)
.set_default(true)
.describe("Profile memory. Default is True.");
DMLC_DECLARE_FIELD(profile_api).set_default(true).describe("Profile C API. Default is True.");
DMLC_DECLARE_FIELD(filename)
.set_default("profile.json")
.describe("File name to write profiling info.");
#if MXNET_USE_CUDA
DMLC_DECLARE_FIELD(gpu_memory_profile_filename_prefix)
.set_default("gpu_memory_profile")
.describe("File name prefix to write GPU memory profile info.");
#endif // MXNET_USE_CUDA
DMLC_DECLARE_FIELD(continuous_dump)
.set_default(true)
.describe(
"Periodically dump (and append) profiling data to file while running. "
"Default is True.");
DMLC_DECLARE_FIELD(dump_period)
.set_default(1.0f)
.describe(
"When continuous dump is enabled, the period between subsequent "
"profile info dumping.");
DMLC_DECLARE_FIELD(aggregate_stats)
.set_default(false)
.describe(
"Maintain aggregate stats, required for MXDumpAggregateStats. Note that "
"this can have a negative performance impact. Default is False.");
DMLC_DECLARE_FIELD(profile_process)
.add_enum("worker", static_cast<int>(ProfileProcess::kWorker))
.add_enum("server", static_cast<int>(ProfileProcess::kServer))
.set_default(static_cast<int>(ProfileProcess::kWorker))
.describe(
"Specifies which process to profile: "
"worker: this is default. for single node training it should always be worker."
"server: for distributed training, this profiles server process");
}
};
DMLC_REGISTER_PARAMETER(ProfileConfigParam);
struct ProfileMarkerScopeParam : public dmlc::Parameter<ProfileMarkerScopeParam> {
int scope;
DMLC_DECLARE_PARAMETER(ProfileMarkerScopeParam) {
DMLC_DECLARE_FIELD(scope)
.set_default(profiler::ProfileMarker::kProcess)
.add_enum("global", profiler::ProfileMarker::kGlobal)
.add_enum("process", profiler::ProfileMarker::kProcess)
.add_enum("thread", profiler::ProfileMarker::kThread)
.add_enum("task", profiler::ProfileMarker::kTask)
.add_enum("marker", profiler::ProfileMarker::kMarker)
.describe("Profile Instant-Marker scope.");
}
};
DMLC_REGISTER_PARAMETER(ProfileMarkerScopeParam);
int MXSetProcessProfilerConfig(int num_params,
const char* const* keys,
const char* const* vals,
KVStoreHandle kvstoreHandle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
std::vector<std::pair<std::string, std::string>> kwargs;
kwargs.reserve(num_params);
for (int i = 0; i < num_params; ++i) {
CHECK_NOTNULL(keys[i]);
CHECK_NOTNULL(vals[i]);
kwargs.emplace_back(std::make_pair(keys[i], vals[i]));
}
ProfileConfigParam param = {};
param.Init(kwargs);
if (static_cast<ProfileProcess>(param.profile_process) == ProfileProcess::kServer) {
std::ostringstream os;
for (int i = 0; i < num_params; ++i) {
// this will be sent to the server now, those configs shouldn't have profile server again
if (strcmp(keys[i], "profile_process") == 0)
continue;
os << keys[i] << ":" << vals[i];
if (i != num_params - 1)
os << ",";
}
CHECK(kvstoreHandle) << "KVStoreHandle passed to profiler is null";
static_cast<KVStore*>(kvstoreHandle)
->SetServerProfilerCommand(mxnet::KVStoreServerProfilerCommand::kSetConfig, os.str());
} else {
int mode = 0;
if (param.profile_api || param.profile_all) {
mode |= profiler::Profiler::kAPI;
}
if (param.profile_symbolic || param.profile_all) {
mode |= profiler::Profiler::kSymbolic;
}
if (param.profile_imperative || param.profile_all) {
mode |= profiler::Profiler::kImperative;
}
if (param.profile_memory || param.profile_all) {
mode |= profiler::Profiler::kMemory;
}
profiler::Profiler::Get()->SetConfig(profiler::Profiler::ProfilerMode(mode),
std::string(param.filename),
param.continuous_dump,
param.dump_period,
param.aggregate_stats);
#if MXNET_USE_CUDA
profiler::GpuDeviceStorageProfiler::Get()->SetConfig(param.gpu_memory_profile_filename_prefix);
#endif // MXNET_USE_CUDA
}
API_END();
}
int MXSetProfilerConfig(int num_params, const char* const* keys, const char* const* vals) {
return MXSetProcessProfilerConfig(num_params, keys, vals, nullptr);
}
int MXAggregateProfileStatsPrint(const char** out_str,
int reset,
int format,
int sort_by,
int ascending) {
MXAPIThreadLocalEntry<>* ret = MXAPIThreadLocalStore<>::Get();
API_BEGIN();
CHECK_NOTNULL(out_str);
profiler::Profiler* profiler = profiler::Profiler::Get();
if (profiler->IsEnableOutput()) {
// Register stats up until now
profiler->DumpProfile(false);
}
std::shared_ptr<profiler::AggregateStats> stats = profiler->GetAggregateStats();
std::ostringstream os;
if (stats) {
if (static_cast<PrintFormat>(format) == PrintFormat::table)
stats->DumpTable(os, sort_by, ascending);
else if (static_cast<PrintFormat>(format) == PrintFormat::json)
stats->DumpJson(os, sort_by, ascending);
else
LOG(FATAL) << "Invalid value for parameter format";
}
if (reset != 0)
stats->clear();
ret->ret_str = os.str();
*out_str = (ret->ret_str).c_str();
API_END();
}
int MXDumpProfile(int finished) {
return MXDumpProcessProfile(finished, static_cast<int>(ProfileProcess::kWorker), nullptr);
}
int MXDumpProcessProfile(int finished, int profile_process, KVStoreHandle kvStoreHandle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
if (static_cast<ProfileProcess>(profile_process) == ProfileProcess::kServer) {
CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null";
static_cast<KVStore*>(kvStoreHandle)
->SetServerProfilerCommand(mxnet::KVStoreServerProfilerCommand::kDump,
std::to_string(finished));
} else {
profiler::Profiler* profiler = profiler::Profiler::Get();
CHECK(profiler->IsEnableOutput())
<< "Profiler hasn't been run. Config and start profiler first";
profiler->DumpProfile(finished != 0);
#if MXNET_USE_CUDA
profiler::GpuDeviceStorageProfiler::Get()->DumpProfile();
#endif // MXNET_USE_CUDA
}
API_END()
}
int MXSetProfilerState(int state) {
return MXSetProcessProfilerState(state, static_cast<int>(ProfileProcess::kWorker), nullptr);
}
int MXSetProfilerScope(const char* const scope) {
API_BEGIN();
profiler::ProfilerScope::Get()->SetCurrentProfilerScope(scope);
API_END();
}
int MXSetProcessProfilerState(int state, int profile_process, KVStoreHandle kvStoreHandle) {
mxnet::IgnoreProfileCallScope ignore;
// state, kNotRunning: 0, kRunning: 1
API_BEGIN();
if (static_cast<ProfileProcess>(profile_process) == ProfileProcess::kServer) {
CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null";
static_cast<KVStore*>(kvStoreHandle)
->SetServerProfilerCommand(mxnet::KVStoreServerProfilerCommand::kState,
std::to_string(state));
} else {
switch (state) {
case profiler::Profiler::kNotRunning:
profiler::vtune::vtune_pause();
break;
case profiler::Profiler::kRunning:
profiler::vtune::vtune_resume();
break;
}
profiler::Profiler::Get()->SetState(profiler::Profiler::ProfilerState(state));
}
API_END();
}
int MXProfileCreateDomain(const char* domain, ProfileHandle* out) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
auto dom = std::make_shared<profiler::ProfileDomain>(domain);
{
std::unique_lock<std::mutex> lock(python_profile_objects.cs_domains_);
python_profile_objects.domains_.push_back(dom);
}
*out = dom.get();
API_END();
}
int MXProfileCreateTask(ProfileHandle domain, const char* task_name, ProfileHandle* out) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
auto ctr = std::make_shared<profiler::ProfileTask>(task_name,
static_cast<profiler::ProfileDomain*>(domain));
{
std::unique_lock<std::mutex> lock(python_profile_objects.cs_tasks_);
python_profile_objects.tasks_.emplace(std::make_pair(ctr.get(), ctr));
}
*out = ctr.get();
API_END();
}
int MXProfileCreateFrame(ProfileHandle domain, const char* frame_name, ProfileHandle* out) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
auto ctr = std::make_shared<profiler::ProfileFrame>(
frame_name, static_cast<profiler::ProfileDomain*>(domain));
{
std::unique_lock<std::mutex> lock(python_profile_objects.cs_frames_);
python_profile_objects.frames_.emplace(std::make_pair(ctr.get(), ctr));
}
*out = ctr.get();
API_END();
}
int MXProfileCreateEvent(const char* event_name, ProfileHandle* out) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
auto ctr = std::make_shared<profiler::ProfileEvent>(event_name);
{
std::unique_lock<std::mutex> lock(python_profile_objects.cs_events_);
python_profile_objects.events_.emplace(std::make_pair(ctr.get(), ctr));
}
*out = ctr.get();
API_END();
}
int MXProfileDestroyHandle(ProfileHandle object_handle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
CHECK_NE(object_handle, static_cast<ProfileHandle>(nullptr))
<< "Invalid NULL handle passed to MXProfileDestroyHandle";
std::shared_ptr<profiler::ProfileObject> shared_object_ptr(nullptr);
{
auto object = static_cast<profiler::ProfileObject*>(object_handle);
switch (object->type()) {
case profiler::kTask: {
auto p = static_cast<profiler::ProfileDuration*>(object_handle);
std::unique_lock<std::mutex> lock(python_profile_objects.cs_tasks_);
auto iter = python_profile_objects.tasks_.find(p);
if (iter != python_profile_objects.tasks_.end()) {
shared_object_ptr = iter->second;
python_profile_objects.tasks_.erase(iter);
}
break;
}
case profiler::kEvent: {
auto p = static_cast<profiler::ProfileDuration*>(object_handle);
std::unique_lock<std::mutex> lock(python_profile_objects.cs_events_);
auto iter = python_profile_objects.events_.find(p);
if (iter != python_profile_objects.events_.end()) {
shared_object_ptr = iter->second;
python_profile_objects.events_.erase(iter);
}
break;
}
case profiler::kFrame: {
auto p = static_cast<profiler::ProfileDuration*>(object_handle);
std::unique_lock<std::mutex> lock(python_profile_objects.cs_frames_);
auto iter = python_profile_objects.frames_.find(p);
if (iter != python_profile_objects.frames_.end()) {
shared_object_ptr = iter->second;
python_profile_objects.frames_.erase(iter);
}
break;
}
case profiler::kCounter: {
auto p = static_cast<profiler::ProfileCounter*>(object_handle);
std::unique_lock<std::mutex> lock(python_profile_objects.cs_counters_);
auto iter = python_profile_objects.counters_.find(p);
if (iter != python_profile_objects.counters_.end()) {
shared_object_ptr = iter->second;
python_profile_objects.counters_.erase(iter);
}
break;
}
case profiler::kDomain:
// Not destroyed
break;
}
}
shared_object_ptr.reset(); // Destroy out of lock scope
API_END();
}
int MXProfileDurationStart(ProfileHandle duration_handle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
CHECK_NOTNULL(duration_handle);
static_cast<profiler::ProfileDuration*>(duration_handle)->start();
API_END();
}
int MXProfileDurationStop(ProfileHandle duration_handle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
CHECK_NOTNULL(duration_handle);
static_cast<profiler::ProfileDuration*>(duration_handle)->stop();
API_END();
}
int MXProfilePause(int paused) {
return MXProcessProfilePause(paused, static_cast<int>(ProfileProcess::kWorker), nullptr);
}
int MXProcessProfilePause(int paused, int profile_process, KVStoreHandle kvStoreHandle) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
if (static_cast<ProfileProcess>(profile_process) == ProfileProcess::kServer) {
CHECK(kvStoreHandle) << "Kvstore Handle passed to profiler is null";
static_cast<KVStore*>(kvStoreHandle)
->SetServerProfilerCommand(mxnet::KVStoreServerProfilerCommand::kPause,
std::to_string(paused));
} else {
if (paused) {
profiler::vtune::vtune_pause();
profiler::Profiler::Get()->set_paused(true);
} else {
profiler::Profiler::Get()->set_paused(false);
profiler::vtune::vtune_resume();
}
}
API_END();
}
int MXProfileCreateCounter(ProfileHandle domain, const char* counter_name, ProfileHandle* out) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
auto ctr = std::make_shared<profiler::ProfileCounter>(
counter_name, static_cast<profiler::ProfileDomain*>(domain));
{
std::unique_lock<std::mutex> lock(python_profile_objects.cs_counters_);
python_profile_objects.counters_.emplace(std::make_pair(ctr.get(), ctr));
}
*out = ctr.get();
API_END();
}
int MXProfileSetCounter(ProfileHandle counter_handle, uint64_t value) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
static_cast<profiler::ProfileCounter*>(counter_handle)->operator=(value);
API_END();
}
int MXProfileAdjustCounter(ProfileHandle counter_handle, int64_t by_value) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
static_cast<profiler::ProfileCounter*>(counter_handle)->operator+=(by_value);
API_END();
}
int MXProfileSetMarker(ProfileHandle domain, const char* instant_marker_name, const char* scope) {
mxnet::IgnoreProfileCallScope ignore;
API_BEGIN();
ProfileMarkerScopeParam param;
std::vector<std::pair<std::string, std::string>> kwargs = {{"scope", scope}};
param.Init(kwargs);
profiler::ProfileMarker marker(instant_marker_name,
static_cast<profiler::ProfileDomain*>(domain),
static_cast<profiler::ProfileMarker::MarkerScope>(param.scope));
marker.mark();
API_END();
}