blob: 168441d1708df241455ed7f7f6b1a2720c321b6e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file src/runtime/profiling.cc
* \brief Runtime profiling including timers.
*/
#include <dmlc/json.h>
#include <tvm/ir/expr.h>
#include <tvm/runtime/c_backend_api.h>
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/threading_backend.h>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <numeric>
#include <thread>
namespace tvm {
namespace runtime {
class DefaultTimerNode : public TimerNode {
public:
virtual void Start() {
TVMSynchronize(device_.device_type, device_.device_id, nullptr);
start_ = std::chrono::high_resolution_clock::now();
}
virtual void Stop() {
TVMSynchronize(device_.device_type, device_.device_id, nullptr);
duration_ = std::chrono::high_resolution_clock::now() - start_;
}
virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
virtual ~DefaultTimerNode() {}
explicit DefaultTimerNode(Device dev) : device_(dev) {}
static constexpr const char* _type_key = "DefaultTimerNode";
TVM_DECLARE_FINAL_OBJECT_INFO(DefaultTimerNode, TimerNode);
private:
std::chrono::high_resolution_clock::time_point start_;
std::chrono::duration<int64_t, std::nano> duration_;
Device device_;
};
TVM_REGISTER_OBJECT_TYPE(DefaultTimerNode);
TVM_REGISTER_OBJECT_TYPE(TimerNode);
Timer DefaultTimer(Device dev) { return Timer(make_object<DefaultTimerNode>(dev)); }
class CPUTimerNode : public TimerNode {
public:
virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() - start_; }
virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
virtual ~CPUTimerNode() {}
static constexpr const char* _type_key = "CPUTimerNode";
TVM_DECLARE_FINAL_OBJECT_INFO(CPUTimerNode, TimerNode);
private:
std::chrono::high_resolution_clock::time_point start_;
std::chrono::duration<int64_t, std::nano> duration_;
};
TVM_REGISTER_OBJECT_TYPE(CPUTimerNode);
TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](Device dev) {
return Timer(make_object<CPUTimerNode>());
});
// keep track of which timers are not defined but we have already warned about
std::set<DLDeviceType> seen_devices;
std::mutex seen_devices_lock;
Timer Timer::Start(Device dev) {
auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(dev.device_type));
if (f == nullptr) {
{
std::lock_guard<std::mutex> lock(seen_devices_lock);
if (seen_devices.find(dev.device_type) == seen_devices.end()) {
LOG(WARNING)
<< "No timer implementation for " << DeviceName(dev.device_type)
<< ", using default timer instead. It may be inaccurate or have extra overhead.";
seen_devices.insert(dev.device_type);
}
}
Timer t = DefaultTimer(dev);
t->Start();
return t;
} else {
Timer t = f->operator()(dev);
t->Start();
return t;
}
}
TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
namespace profiling {
Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors,
std::unordered_map<String, ObjectRef> configuration)
: devs_(devs), collectors_(metric_collectors), configuration_(configuration) {
is_running_ = false;
std::vector<DeviceWrapper> wrapped_devs;
for (auto dev : devs) {
wrapped_devs.push_back(DeviceWrapper(make_object<DeviceWrapperNode>(dev)));
}
for (auto& x : collectors_) {
x->Init(wrapped_devs);
}
// reset the thread pool so that PAPI eventset hooks are set in all threads.
threading::ResetThreadPool();
configuration_[String("Number of threads")] =
ObjectRef(make_object<CountNode>(threading::NumThreads()));
}
void Profiler::Start() {
is_running_ = true;
for (auto dev : devs_) {
StartCall("Total", dev, {});
}
}
void Profiler::StartCall(String name, Device dev,
std::unordered_map<std::string, ObjectRef> extra_metrics) {
std::vector<std::pair<MetricCollector, ObjectRef>> objs;
for (auto& collector : collectors_) {
ObjectRef obj = collector->Start(dev);
if (obj.defined()) {
objs.emplace_back(collector, obj);
}
}
in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics, objs});
}
void Profiler::StopCall(std::unordered_map<std::string, ObjectRef> extra_metrics) {
CallFrame cf = in_flight_.top();
cf.timer->Stop();
for (auto& p : extra_metrics) {
cf.extra_metrics[p.first] = p.second;
}
// collect the extra metrics from user defined collectors
for (const auto& obj : cf.extra_collectors) {
auto collector_metrics = obj.first->Stop(obj.second);
for (auto& p : collector_metrics) {
cf.extra_metrics[p.first] = p.second;
}
}
in_flight_.pop();
calls_.push_back(cf);
}
void Profiler::Stop() {
is_running_ = false;
for (size_t i = 0; i < devs_.size(); i++) {
StopCall();
}
}
std::vector<int64_t> ToShape(NDArray shape_tensor) {
std::vector<int64_t> shape;
auto rank = shape_tensor.Shape().size();
auto dtype = shape_tensor.DataType();
// For 0-rank shapes we need to allocate a single scalar.
if (rank == 0) {
return shape;
}
// Otherwise we should be rank-1, and we will extract the number of dimensions
// for the output vector.
ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, found " << rank;
int64_t ndim = shape_tensor.Shape().at(0);
shape.resize(ndim);
const DLTensor* dl_tensor = shape_tensor.operator->();
if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) {
int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data);
shape.assign(dims, dims + ndim);
} else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) {
int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data);
shape.assign(dims, dims + ndim);
} else {
LOG(FATAL) << "invalid shape tensor datatype: " << dtype;
}
return shape;
}
String ShapeString(NDArray shape, DLDataType dtype) { return ShapeString(ToShape(shape), dtype); }
String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype) {
std::stringstream sizes;
sizes << dtype << "[";
for (size_t i = 0; i < shape.size(); i++) {
if (i != 0) {
sizes << ", ";
}
sizes << shape[i];
}
sizes << "]";
return String(sizes.str());
}
String ShapeString(const std::vector<NDArray>& shapes) {
std::stringstream sizes;
for (const NDArray& ary : shapes) {
if (sizes.tellp() > 0) {
sizes << ", ";
}
auto shape = ary.Shape();
sizes << ary.DataType() << "[";
for (size_t i = 0; i < shape.size(); i++) {
if (i != 0) {
sizes << ", ";
}
sizes << shape[i];
}
sizes << "]";
}
return String(sizes.str());
}
String ReportNode::AsCSV() const {
// get unique headers
std::set<std::string> unique_headers;
for (auto row : calls) {
for (auto p : row) {
unique_headers.insert(p.first);
}
}
std::vector<std::string> headers;
for (auto x : unique_headers) {
headers.push_back(x);
}
std::stringstream s;
for (size_t i = 0; i < headers.size(); i++) {
std::string header = headers[i];
s << header;
if (i < headers.size() - 1) {
s << ",";
}
}
s << std::endl;
for (auto row : calls) {
for (size_t i = 0; i < headers.size(); i++) {
std::string header = headers[i];
auto it = row.find(header);
if (it != row.end()) {
std::string val;
if ((*it).second.as<CountNode>()) {
s << (*it).second.as<CountNode>()->value;
} else if ((*it).second.as<DurationNode>()) {
s << (*it).second.as<DurationNode>()->microseconds;
} else if ((*it).second.as<PercentNode>()) {
s << (*it).second.as<PercentNode>()->percent;
} else if ((*it).second.as<RatioNode>()) {
s << (*it).second.as<RatioNode>()->ratio;
} else if ((*it).second.as<StringObj>()) {
s << "\"" << Downcast<String>((*it).second) << "\"";
}
}
if (i < headers.size() - 1) {
s << ",";
}
}
s << std::endl;
}
return s.str();
}
namespace {
void metric_as_json(std::ostream& os, ObjectRef o) {
if (o.as<StringObj>()) {
os << "{\"string\":"
<< "\"" << Downcast<String>(o) << "\""
<< "}";
} else if (const CountNode* n = o.as<CountNode>()) {
os << "{\"count\":" << n->value << "}";
} else if (const DurationNode* n = o.as<DurationNode>()) {
os << "{\"microseconds\":" << std::setprecision(std::numeric_limits<double>::max_digits10)
<< std::fixed << n->microseconds << "}";
} else if (const PercentNode* n = o.as<PercentNode>()) {
os << "{\"percent\":" << std::setprecision(std::numeric_limits<double>::max_digits10)
<< std::fixed << n->percent << "}";
} else if (const RatioNode* n = o.as<RatioNode>()) {
os << "{\"ratio\":" << std::setprecision(std::numeric_limits<double>::max_digits10)
<< std::fixed << n->ratio << "}";
} else {
LOG(FATAL) << "Unprintable type " << o->GetTypeKey();
}
}
} // namespace
String ReportNode::AsJSON() const {
std::ostringstream s;
// DMLC's JSONWriter does not allow us to write a key value pair without
// implementing Write for the value. We want a specific write for the value,
// so we would have to implement a custom data structure for each type of
// value we want to print. Instead we construct the json by hand because it
// is easier.
s << "{";
s << "\"calls\":[";
for (size_t i = 0; i < calls.size(); i++) {
size_t j = 0;
s << "{";
for (const auto& kv : calls[i]) {
s << "\"" << kv.first << "\":";
metric_as_json(s, kv.second);
if (j < calls[i].size() - 1) {
s << ",";
}
j++;
}
s << "}";
if (i < calls.size() - 1) {
s << ",";
}
}
s << "],"; // end calls
s << "\"device_metrics\":{";
size_t i = 0;
for (const auto& dev_kv : device_metrics) {
size_t j = 0;
s << "\"" << dev_kv.first << "\":{";
for (const auto& metric_kv : dev_kv.second) {
s << "\"" << metric_kv.first << "\":";
metric_as_json(s, metric_kv.second);
if (j < dev_kv.second.size() - 1) {
s << ",";
}
j++;
}
s << "}";
if (i < device_metrics.size() - 1) {
s << ",";
}
i++;
}
s << "},"; // end device metrics
s << "\"configuration\":{";
size_t k = 0;
for (const auto& kv : configuration) {
s << "\"" << kv.first << "\":";
metric_as_json(s, kv.second);
if (k < configuration.size() - 1) {
s << ",";
}
k++;
}
s << "}"; // end configuration
s << "}";
return s.str();
}
// Aggregate a set of values for a metric. Computes sum for Duration, Count,
// and Percent; average for Ratio; and assumes all Strings are the same. All
// ObjectRefs in metrics must have the same type.
ObjectRef AggregateMetric(const std::vector<ObjectRef>& metrics) {
ICHECK_GT(metrics.size(), 0) << "Must pass a non-zero number of metrics";
if (metrics[0].as<DurationNode>()) {
double sum = 0;
for (auto& metric : metrics) {
sum += metric.as<DurationNode>()->microseconds;
}
return ObjectRef(make_object<DurationNode>(sum));
} else if (metrics[0].as<CountNode>()) {
int64_t sum = 0;
for (auto& metric : metrics) {
sum += metric.as<CountNode>()->value;
}
return ObjectRef(make_object<CountNode>(sum));
} else if (metrics[0].as<PercentNode>()) {
double sum = 0;
for (auto& metric : metrics) {
sum += metric.as<PercentNode>()->percent;
}
return ObjectRef(make_object<PercentNode>(sum));
} else if (metrics[0].as<RatioNode>()) {
double sum = 0;
for (auto& metric : metrics) {
sum += metric.as<RatioNode>()->ratio;
}
return ObjectRef(make_object<RatioNode>(sum / metrics.size()));
} else if (metrics[0].as<StringObj>()) {
for (auto& m : metrics) {
if (Downcast<String>(metrics[0]) != Downcast<String>(m)) {
return ObjectRef(String(""));
}
}
// Assume all strings in metrics are the same.
return metrics[0];
} else {
LOG(FATAL) << "Can only aggregate metrics with types DurationNode, CountNode, "
"PercentNode, RatioNode, and StringObj, but got "
<< metrics[0]->GetTypeKey();
return ObjectRef(); // To silence warnings
}
}
// Try and set the locale of the provided stringstream so that it will print
// numbers with thousands separators. Sometimes users will have a misconfigured
// system where an invalid locale is set, so we catch and ignore any locale
// errors.
static void set_locale_for_separators(std::stringstream& s) {
try {
// empty string indicates locale should be the user's default, see man 3 setlocale
s.imbue(std::locale(""));
} catch (std::runtime_error& e) {
}
}
static String print_metric(ObjectRef metric) {
std::string val;
if (metric.as<CountNode>()) {
std::stringstream s;
set_locale_for_separators(s);
s << std::fixed << metric.as<CountNode>()->value;
val = s.str();
} else if (metric.as<DurationNode>()) {
std::stringstream s;
set_locale_for_separators(s);
s << std::fixed << std::setprecision(2) << metric.as<DurationNode>()->microseconds;
val = s.str();
} else if (metric.as<PercentNode>()) {
std::stringstream s;
s << std::fixed << std::setprecision(2) << metric.as<PercentNode>()->percent;
val = s.str();
} else if (metric.as<RatioNode>()) {
std::stringstream s;
set_locale_for_separators(s);
s << std::setprecision(2) << metric.as<RatioNode>()->ratio;
val = s.str();
} else if (metric.as<StringObj>()) {
val = Downcast<String>(metric);
} else {
LOG(FATAL) << "Cannot print metric of type " << metric->GetTypeKey();
}
return val;
}
String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) const {
// aggregate calls by op hash (or op name if hash is not set) + argument shapes
std::vector<Map<String, ObjectRef>> aggregated_calls;
if (aggregate) {
std::unordered_map<std::string, std::vector<size_t>> aggregates;
for (size_t i = 0; i < calls.size(); i++) {
auto& frame = calls[i];
auto it = frame.find("Hash");
std::string name = Downcast<String>(frame["Name"]);
if (it != frame.end()) {
name = Downcast<String>((*it).second);
}
if (frame.find("Argument Shapes") != frame.end()) {
name += Downcast<String>(frame["Argument Shapes"]);
}
if (frame.find("Device") != frame.end()) {
name += Downcast<String>(frame["Device"]);
}
if (aggregates.find(name) == aggregates.end()) {
aggregates[name] = {i};
} else {
aggregates[name].push_back(i);
}
}
for (const auto& p : aggregates) {
std::unordered_map<String, ObjectRef> aggregated;
std::unordered_set<std::string> metrics;
for (auto& call : calls) {
for (auto& metric : call) {
metrics.insert(metric.first);
}
}
for (const std::string& metric : metrics) {
std::vector<ObjectRef> per_call;
for (auto i : p.second) {
auto& call = calls[i];
auto it = std::find_if(call.begin(), call.end(),
[&metric](const std::pair<String, ObjectRef>& call_metric) {
return std::string(call_metric.first) == metric;
});
if (it != call.end()) {
per_call.push_back((*it).second);
}
}
if (per_call.size() > 0) {
aggregated[metric] = AggregateMetric(per_call);
}
}
aggregated_calls.push_back(aggregated);
}
} else {
for (auto call : calls) {
aggregated_calls.push_back(call);
}
}
// sort rows by duration
if (sort) {
std::sort(aggregated_calls.begin(), aggregated_calls.end(),
[&](const Map<String, ObjectRef>& a, const Map<String, ObjectRef>& b) {
return a.at("Duration (us)").as<DurationNode>()->microseconds >
b.at("Duration (us)").as<DurationNode>()->microseconds;
});
}
// compute columnwise sums
if (compute_col_sums) {
std::unordered_map<String, ObjectRef> col_sums;
for (auto call : aggregated_calls) {
for (auto p : call) {
if (p.second.as<CountNode>()) {
int64_t val = p.second.as<CountNode>()->value;
auto it = col_sums.find(p.first);
if (it != col_sums.end()) {
val += it->second.as<CountNode>()->value;
}
col_sums[p.first] = ObjectRef(make_object<CountNode>(val));
} else if (p.second.as<DurationNode>()) {
double val = p.second.as<DurationNode>()->microseconds;
auto it = col_sums.find(p.first);
if (it != col_sums.end()) {
val += it->second.as<DurationNode>()->microseconds;
}
col_sums[p.first] = ObjectRef(make_object<DurationNode>(val));
} else if (p.second.as<PercentNode>()) {
double val = p.second.as<PercentNode>()->percent;
auto it = col_sums.find(p.first);
if (it != col_sums.end()) {
val += it->second.as<PercentNode>()->percent;
}
col_sums[p.first] = ObjectRef(make_object<PercentNode>(val));
} else if (p.second.as<RatioNode>()) {
// It does not make sense to sum ratios
}
}
}
col_sums["Name"] = String("Sum");
aggregated_calls.push_back({{String("Name"), String("----------")}}); // separator
aggregated_calls.push_back(col_sums);
}
// per-device metrics
for (auto p : device_metrics) {
Map<String, ObjectRef> metrics = p.second;
metrics.Set("Name", String("Total"));
aggregated_calls.push_back(metrics);
}
// Table formatting
std::set<std::string> unique_headers;
for (auto row : aggregated_calls) {
for (auto p : row) {
unique_headers.insert(p.first);
}
}
// always include these headers in this order
std::vector<std::string> headers = {"Name", "Duration (us)", "Percent",
"Device", "Count", "Argument Shapes"};
for (auto header : unique_headers) {
if (std::find(headers.begin(), headers.end(), header) == headers.end()) {
headers.push_back(header);
}
}
// Switch layout from row major to column major so we can easily compute column widths.
std::vector<std::vector<std::string>> cols;
for (auto header : headers) {
cols.push_back({header});
}
for (auto row : aggregated_calls) {
for (size_t i = 0; i < headers.size(); i++) {
auto it = row.find(headers[i]);
if (it == row.end()) {
// fill empty data with empty strings
cols[i].push_back("");
} else {
cols[i].push_back(print_metric((*it).second));
}
}
}
std::vector<size_t> widths;
for (auto v : cols) {
size_t width = 0;
for (auto x : v) {
width = std::max(width, x.size());
}
widths.push_back(width);
}
size_t length = 0;
for (auto v : cols) {
length = std::max(length, v.size());
}
std::stringstream s;
for (size_t row = 0; row < length; row++) {
for (size_t col = 0; col < cols.size(); col++) {
// left align first column
if (col == 0) {
s << std::left;
} else {
s << std::right;
}
if (row < cols[col].size()) {
s << std::setw(widths[col]) << cols[col][row] << " ";
} else {
s << std::setw(widths[col]) << ""
<< " ";
}
}
s << std::endl;
}
// Add configuration information. It will not be aligned with the columns.
s << std::endl << "Configuration" << std::endl << "-------------" << std::endl;
for (auto kv : configuration) {
s << kv.first << ": " << print_metric(kv.second) << std::endl;
}
return s.str();
}
std::string DeviceString(Device dev) {
return DeviceName(dev.device_type) + std::to_string(dev.device_id);
}
Report Profiler::Report() {
// sync all timers and normalize rows
std::vector<std::unordered_map<String, ObjectRef>> rows;
for (auto& cf : calls_) {
std::unordered_map<String, ObjectRef> row;
double us = cf.timer->SyncAndGetElapsedNanos() / 1e3;
row["Duration (us)"] = ObjectRef(make_object<DurationNode>(us));
row["Count"] = ObjectRef(make_object<CountNode>(1));
row["Name"] = cf.name;
row["Device"] = String(DeviceString(cf.dev));
for (auto p : cf.extra_metrics) {
row[p.first] = p.second;
}
rows.push_back(row);
}
// the last couple of call frames are the overall times
double overall_time_us = 0;
std::unordered_map<String, Map<String, ObjectRef>> device_metrics;
for (size_t i = 0; i < devs_.size(); i++) {
auto row = rows[rows.size() - 1];
rows.pop_back();
device_metrics[Downcast<String>(row["Device"])] = row;
overall_time_us =
std::max(overall_time_us, row["Duration (us)"].as<DurationNode>()->microseconds);
}
// Calculate percentages
for (auto& row : rows) {
row["Percent"] = ObjectRef(make_object<PercentNode>(
row["Duration (us)"].as<DurationNode>()->microseconds / overall_time_us * 100));
}
// convert to map
std::vector<Map<String, ObjectRef>> converted_rows;
for (const auto& row : rows) {
converted_rows.push_back(row);
}
return profiling::Report(converted_rows, device_metrics, configuration_);
}
Report::Report(Array<Map<String, ObjectRef>> calls,
Map<String, Map<String, ObjectRef>> device_metrics,
Map<String, ObjectRef> configuration) {
auto node = make_object<ReportNode>();
node->calls = std::move(calls);
node->device_metrics = std::move(device_metrics);
node->configuration = std::move(configuration);
data_ = std::move(node);
}
Map<String, ObjectRef> parse_metrics(dmlc::JSONReader* reader) {
reader->BeginObject();
std::string metric_name, metric_value_name;
Map<String, ObjectRef> metrics;
while (reader->NextObjectItem(&metric_name)) {
ObjectRef o;
reader->BeginObject();
reader->NextObjectItem(&metric_value_name);
if (metric_value_name == "microseconds") {
double microseconds;
reader->Read(&microseconds);
o = ObjectRef(make_object<DurationNode>(microseconds));
} else if (metric_value_name == "percent") {
double percent;
reader->Read(&percent);
o = ObjectRef(make_object<PercentNode>(percent));
} else if (metric_value_name == "count") {
int64_t count;
reader->Read(&count);
o = ObjectRef(make_object<CountNode>(count));
} else if (metric_value_name == "ratio") {
double ratio;
reader->Read(&ratio);
o = ObjectRef(make_object<RatioNode>(ratio));
} else if (metric_value_name == "string") {
std::string s;
reader->Read(&s);
o = String(s);
} else {
LOG(FATAL) << "Cannot parse metric of type " << metric_value_name
<< " valid types are microseconds, percent, count.";
}
metrics.Set(metric_name, o);
// Necessary to make sure that the parser hits the end of the object.
ICHECK(!reader->NextObjectItem(&metric_value_name));
// EndObject does not exist, leaving this here for clarity
// reader.EndObject();
}
// reader.EndObject();
return metrics;
}
Report Report::FromJSON(String json) {
std::stringstream input(json.operator std::string());
dmlc::JSONReader reader(&input);
std::string key;
Array<Map<String, ObjectRef>> calls;
Map<String, Map<String, ObjectRef>> device_metrics;
Map<String, ObjectRef> configuration;
reader.BeginObject();
while (reader.NextObjectItem(&key)) {
if (key == "calls") {
reader.BeginArray();
while (reader.NextArrayItem()) {
calls.push_back(parse_metrics(&reader));
}
// reader.EndArray();
} else if (key == "device_metrics") {
reader.BeginObject();
std::string device_name;
while (reader.NextObjectItem(&device_name)) {
device_metrics.Set(device_name, parse_metrics(&reader));
}
// reader.EndObject();
} else if (key == "configuration") {
configuration = parse_metrics(&reader);
}
}
return Report(calls, device_metrics, configuration);
}
TVM_REGISTER_OBJECT_TYPE(DurationNode);
TVM_REGISTER_OBJECT_TYPE(PercentNode);
TVM_REGISTER_OBJECT_TYPE(CountNode);
TVM_REGISTER_OBJECT_TYPE(RatioNode);
TVM_REGISTER_OBJECT_TYPE(ReportNode);
TVM_REGISTER_OBJECT_TYPE(DeviceWrapperNode);
TVM_REGISTER_OBJECT_TYPE(MetricCollectorNode);
TVM_REGISTER_GLOBAL("runtime.profiling.AsTable").set_body_method<Report>(&ReportNode::AsTable);
TVM_REGISTER_GLOBAL("runtime.profiling.AsCSV").set_body_typed([](Report n) { return n->AsCSV(); });
TVM_REGISTER_GLOBAL("runtime.profiling.AsJSON").set_body_typed([](Report n) {
return n->AsJSON();
});
TVM_REGISTER_GLOBAL("runtime.profiling.FromJSON").set_body_typed(Report::FromJSON);
TVM_REGISTER_GLOBAL("runtime.profiling.DeviceWrapper").set_body_typed([](Device dev) {
return DeviceWrapper(dev);
});
PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, int device_id,
int warmup_iters, Array<MetricCollector> collectors) {
// Module::GetFunction is not const, so this lambda has to be mutable
return PackedFunc([=](TVMArgs args, TVMRetValue* ret) mutable {
PackedFunc f = mod.GetFunction(func_name);
CHECK(f.defined()) << "There is no function called \"" << func_name << "\" in the module";
Device dev{static_cast<DLDeviceType>(device_type), device_id};
// warmup
for (int i = 0; i < warmup_iters; i++) {
f.CallPacked(args, ret);
}
for (auto& collector : collectors) {
collector->Init({DeviceWrapper(dev)});
}
std::vector<Map<String, ObjectRef>> results;
results.reserve(collectors.size());
std::vector<std::pair<MetricCollector, ObjectRef>> collector_data;
collector_data.reserve(collectors.size());
for (auto& collector : collectors) {
ObjectRef o = collector->Start(dev);
// If not defined, then the collector cannot time this device.
if (o.defined()) {
collector_data.push_back({collector, o});
}
}
// TODO(tkonolige): repeated calls if the runtime is small?
f.CallPacked(args, ret);
for (auto& kv : collector_data) {
results.push_back(kv.first->Stop(kv.second));
}
Map<String, ObjectRef> combined_results;
for (auto m : results) {
for (auto p : m) {
// assume that there is no shared metric name between collectors
combined_results.Set(p.first, p.second);
}
}
*ret = combined_results;
});
}
TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
.set_body_typed<PackedFunc(Module, String, int, int, int,
Array<MetricCollector>)>([](Module mod, String func_name,
int device_type, int device_id,
int warmup_iters,
Array<MetricCollector> collectors) {
if (mod->type_key() == std::string("rpc")) {
LOG(FATAL)
<< "Profiling a module over RPC is not yet supported"; // because we can't send
// MetricCollectors over rpc.
throw;
} else {
return ProfileFunction(mod, func_name, device_type, device_id, warmup_iters, collectors);
}
});
PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms,
int limit_zero_time_iterations, int cooldown_interval_ms,
int repeats_to_cooldown, PackedFunc f_preproc) {
ICHECK(pf != nullptr);
if (static_cast<int>(dev.device_type) == static_cast<int>(kDLMicroDev)) {
auto get_micro_time_evaluator = runtime::Registry::Get("micro._GetMicroTimeEvaluator");
ICHECK(get_micro_time_evaluator != nullptr) << "micro backend not enabled";
return (*get_micro_time_evaluator)(pf, dev, number, repeat);
}
auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations,
cooldown_interval_ms, repeats_to_cooldown,
f_preproc](TVMArgs args, TVMRetValue* rv) mutable {
TVMRetValue temp;
std::ostringstream os;
// skip first time call, to activate lazy compilation components.
pf.CallPacked(args, &temp);
DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
for (int i = 0; i < repeat; ++i) {
if (f_preproc != nullptr) {
f_preproc.CallPacked(args, &temp);
}
double duration_ms = 0.0;
int absolute_zero_times = 0;
do {
if (duration_ms > 0.0) {
const double golden_ratio = 1.618;
number = static_cast<int>(
std::max((min_repeat_ms / (duration_ms / number) + 1), number * golden_ratio));
}
// start timing
Timer t = Timer::Start(dev);
for (int j = 0; j < number; ++j) {
pf.CallPacked(args, &temp);
}
t->Stop();
int64_t t_nanos = t->SyncAndGetElapsedNanos();
if (t_nanos == 0) absolute_zero_times++;
duration_ms = t_nanos / 1e6;
} while (duration_ms < min_repeat_ms && absolute_zero_times < limit_zero_time_iterations);
double speed = duration_ms / 1e3 / number;
os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) {
std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
}
}
std::string blob = os.str();
TVMByteArray arr;
arr.size = blob.length();
arr.data = blob.data();
// return the time.
*rv = arr;
};
return PackedFunc(ftimer);
}
TVM_REGISTER_GLOBAL("runtime.profiling.Report")
.set_body_typed([](Array<Map<String, ObjectRef>> calls,
Map<String, Map<String, ObjectRef>> device_metrics,
Map<String, ObjectRef> configuration) {
return Report(calls, device_metrics, configuration);
});
TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t count) {
return ObjectRef(make_object<CountNode>(count));
});
TVM_REGISTER_GLOBAL("runtime.profiling.Percent").set_body_typed([](double percent) {
return ObjectRef(make_object<PercentNode>(percent));
});
TVM_REGISTER_GLOBAL("runtime.profiling.Duration").set_body_typed([](double duration) {
return ObjectRef(make_object<DurationNode>(duration));
});
TVM_REGISTER_GLOBAL("runtime.profiling.Ratio").set_body_typed([](double ratio) {
return ObjectRef(make_object<RatioNode>(ratio));
});
} // namespace profiling
} // namespace runtime
} // namespace tvm