blob: 53875b66b2da5c66ceaeefdbd1d21fad7a2fd8f0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file profiler.cc
* \brief implements profiler
*/
#include <dmlc/base.h>
#include <dmlc/logging.h>
#include <dmlc/omp.h>
#include <mxnet/base.h>
#include <fstream>
#include <thread>
#include "./profiler.h"
#if MXNET_USE_CUDA
#include "../common/cuda/utils.h"
#endif
#if defined(_MSC_VER) && _MSC_VER <= 1800
#include <Windows.h>
#endif
namespace mxnet {
namespace profiler {
ProfileDomain ProfileOperator::domain_("operator");
Profiler::Profiler()
: state_(kNotRunning),
enable_output_(false),
filename_("profile.json"),
profile_dump_count_(0) {
this->init_time_ = ProfileStat::NowInMicrosec();
this->cpu_num_ = 0;
#ifdef _OPENMP
// OMP is going to know better than a generic stl, which might return 0
this->cpu_num_ = static_cast<unsigned int>(::omp_get_num_procs());
#else
this->cpu_num_ = std::thread::hardware_concurrency();
#endif
if (!this->cpu_num_) {
this->cpu_num_ = 64;
}
#if MXNET_USE_CUDA
int kMaxNumGpus = 32;
this->gpu_num_ = kMaxNumGpus;
#else
this->gpu_num_ = 0;
#endif
this->profile_stat = std::unique_ptr<DeviceStats[]>(new DeviceStats[cpu_num_ + gpu_num_ + 2]);
for (unsigned int i = 0; i < cpu_num_; ++i) {
this->profile_stat[i].dev_name_ = "cpu/" + std::to_string(i);
}
for (unsigned int i = 0; i < gpu_num_; ++i) {
this->profile_stat[cpu_num_ + i].dev_name_ = "gpu/" + std::to_string(i);
}
this->profile_stat[cpu_num_ + gpu_num_].dev_name_ = "cpu pinned/";
this->profile_stat[cpu_num_ + gpu_num_ + 1].dev_name_ = "cpu shared/";
this->mode_ = dmlc::GetEnv("MXNET_PROFILER_MODE", this->mode_);
if (dmlc::GetEnv("MXNET_PROFILER_AUTOSTART", 0)) {
this->state_ = ProfilerState::kRunning;
this->enable_output_ = true;
// Since we want to avoid interfering with pure-VTune analysis runs, for not set,
// vtune will be recording based upon whether "Start" or "STart Paused" was selected
vtune::vtune_resume();
}
}
Profiler::~Profiler() {
DumpProfile(true);
if (thread_group_) {
thread_group_->request_shutdown_all();
thread_group_->join_all();
thread_group_.reset();
}
}
Profiler* Profiler::Get(std::shared_ptr<Profiler>* sp) {
static std::mutex mtx;
static std::shared_ptr<Profiler> prof = nullptr;
if (!prof) {
std::unique_lock<std::mutex> lk(mtx);
if (!prof) {
prof = std::make_shared<Profiler>();
}
}
if (sp) {
*sp = prof;
}
return prof.get();
}
void Profiler::SetState(ProfilerState state) {
std::lock_guard<std::recursive_mutex> lock{this->m_};
this->state_ = state;
// once running, output will be enabled.
if (state == kRunning) {
this->enable_output_ = true;
set_paused(false);
} else {
set_paused(true);
}
}
void Profiler::SetConfig(int mode,
std::string output_filename,
bool continuous_dump,
float dump_period,
bool aggregate_stats) {
CHECK(!continuous_dump || dump_period > 0);
std::lock_guard<std::recursive_mutex> lock{this->m_};
this->mode_ = mode;
this->filename_ = output_filename;
// Remove the output file to start
if (!this->filename_.empty()) {
::unlink(this->filename_.c_str());
}
SetContinuousProfileDump(continuous_dump, dump_period);
// Adjust whether storing aggregate stats as necessary
if (aggregate_stats) {
if (!aggregate_stats_) {
aggregate_stats_ = std::make_shared<AggregateStats>();
}
} else if (aggregate_stats_) {
aggregate_stats_.reset();
}
}
/*
* Docs for tracing format:
* https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
*/
void Profiler::EmitPid(std::ostream* os, const std::string& name, size_t pid) {
(*os) << " {\n"
<< R"( "ph": ")" << static_cast<char>(ProfileStat::kMetadata) << "\",\n"
<< " \"args\": {\n"
<< R"( "name": ")" << name << "\"\n"
<< " },\n"
<< " \"pid\": " << pid << ",\n"
<< " \"name\": \"process_name\"\n"
<< " }";
}
void Profiler::DumpProfile(bool perform_cleanup) {
std::lock_guard<std::recursive_mutex> lock{this->m_};
if (!IsEnableOutput()) {
return;
}
if (perform_cleanup) {
SetContinuousProfileDump(false, 1.0f);
}
std::ofstream file;
const bool first_pass = ++profile_dump_count_ == 1;
const bool last_pass = perform_cleanup || !continuous_dump_;
if (!first_pass && continuous_dump_) {
file.open(filename_, std::ios::app | std::ios::out);
} else {
file.open(filename_, std::ios::trunc | std::ios::out);
}
if (first_pass || !continuous_dump_) {
file << "{" << std::endl;
file << " \"traceEvents\": [" << std::endl;
}
const size_t dev_num = DeviceCount();
if (first_pass) {
for (uint32_t pid = 0; pid < dev_num; ++pid) {
if (pid) {
file << ",\n";
}
const DeviceStats& d = profile_stat[pid];
this->EmitPid(&file, d.dev_name_, pid);
process_ids_.emplace(pid);
}
}
// Hold ref in case SetConfig() resets aggregate_stats_
// If aggregate stats aren't enabled, this won't cause a locked instruction
std::shared_ptr<AggregateStats> ptr_aggregate_stats =
aggregate_stats_.get() ? aggregate_stats_ : nullptr;
for (uint32_t i = 0; i < dev_num; ++i) {
DeviceStats& d = profile_stat[i];
ProfileStat* _opr_stat;
while (d.opr_exec_stats_->try_dequeue(_opr_stat)) {
CHECK_NOTNULL(_opr_stat);
std::unique_ptr<ProfileStat> opr_stat(_opr_stat); // manage lifecycle
opr_stat->process_id_ = i; // lie and set process id to be the device number
file << ",\n" << std::endl;
opr_stat->EmitEvents(&file);
++num_records_emitted_;
if (ptr_aggregate_stats) {
ptr_aggregate_stats->OnProfileStat(*_opr_stat);
}
}
}
// Now do the non-device items
ProfileStat* _profile_stat;
while (general_stats_.opr_exec_stats_->try_dequeue(_profile_stat)) {
CHECK_NOTNULL(_profile_stat);
file << ",";
std::unique_ptr<ProfileStat> profile_stat(_profile_stat); // manage lifecycle
CHECK_NE(profile_stat->categories_.c_str()[0], '\0') << "Category must be set";
// Currently, category_to_pid_ is only accessed here, so it is protected by this->m_ above
auto iter = category_to_pid_.find(profile_stat->categories_.c_str());
if (iter == category_to_pid_.end()) {
static std::hash<std::string> hash_fn;
const size_t this_pid = hash_fn(profile_stat->categories_.c_str());
iter = category_to_pid_.emplace(std::make_pair(profile_stat->categories_.c_str(), this_pid))
.first;
EmitPid(&file, profile_stat->categories_.c_str(), iter->second);
file << ",\n";
}
profile_stat->process_id_ = iter->second;
file << std::endl;
profile_stat->EmitEvents(&file);
++num_records_emitted_;
if (ptr_aggregate_stats) {
ptr_aggregate_stats->OnProfileStat(*profile_stat);
}
}
if (last_pass) {
file << "\n" << std::endl;
file << " ]," << std::endl;
file << R"( "displayTimeUnit": "ms")" << std::endl;
file << "}" << std::endl;
}
enable_output_ = continuous_dump_ && !last_pass; // If we're appending, then continue.
// Otherwise, profiling stops.
}
static constexpr char TIMER_THREAD_NAME[] = "DumpProfileTimer";
void Profiler::SetContinuousProfileDump(bool continuous_dump, float delay_in_seconds) {
std::lock_guard<std::recursive_mutex> lock{this->m_};
if (continuous_dump) {
this->continuous_dump_ = true; // Continuous doesn't make sense without append mode
DumpProfile(false);
std::shared_ptr<dmlc::ThreadGroup::Thread> old_thread =
thread_group_->thread_by_name(TIMER_THREAD_NAME);
if (old_thread && old_thread->is_shutdown_requested()) {
// This should never happen unless someone is doing something malicious
// At any rate, wait for its shutdown to complete
if (old_thread->joinable()) {
old_thread->join();
} else {
do {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
} while (thread_group_->thread_by_name(TIMER_THREAD_NAME));
}
old_thread.reset();
}
if (!old_thread) {
dmlc::CreateTimer(TIMER_THREAD_NAME,
std::chrono::milliseconds(static_cast<size_t>(delay_in_seconds * 1000.0f)),
thread_group_.get(),
[this]() -> int {
DumpProfile(false);
return 0;
});
}
} else {
std::shared_ptr<dmlc::ThreadGroup::Thread> old_thread =
thread_group_->thread_by_name(TIMER_THREAD_NAME);
if (old_thread) {
// Signal it to finish asynchronously
old_thread->request_shutdown();
}
}
}
ProfilerScope* ProfilerScope::Get() {
static std::mutex mtx;
static std::shared_ptr<ProfilerScope> prof_scope = nullptr;
std::unique_lock<std::mutex> lk(mtx);
if (!prof_scope) {
prof_scope = std::make_shared<ProfilerScope>();
}
return prof_scope.get();
}
void ProfilerScope::SetCurrentProfilerScope(const std::string& scope) {
current_profiler_scope_ = scope;
}
std::string ProfilerScope::GetCurrentProfilerScope() const {
return current_profiler_scope_;
}
} // namespace profiler
} // namespace mxnet