blob: 2259c85dee51629859bcfa1f402a29e4b92c02a7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file cudnn_cxx.cc
*/
#include "cudnn_cxx.h"
#include <mxnet/base.h>
#if MXNET_USE_CUDNN == 1
#include <mxnet/storage.h>
#include <algorithm>
#include <sstream>
#include <utility>
namespace mxnet {
namespace cudnn_cxx {
Descriptor Make(cudnnBackendDescriptorType_t type) {
cudnnBackendDescriptor_t desc{};
CUDNN_CALL(cudnnBackendCreateDescriptor(type, &desc));
return Descriptor(desc);
}
std::vector<cudnnBackendDescriptor_t> MakeRawDescriptors(size_t n,
cudnnBackendDescriptorType_t type) {
std::vector<cudnnBackendDescriptor_t> ret(n);
for (auto& d : ret)
CUDNN_CALL(cudnnBackendCreateDescriptor(type, &d));
return ret;
}
void SetAttr(const Descriptor& desc, cudnnBackendAttributeName_t name, const Descriptor& val) {
auto raw = val.get();
CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &raw));
}
void SetAttr(const Descriptor& desc, cudnnBackendAttributeName_t name, const WeakDescriptor& val) {
auto raw = val.get();
CUDNN_CALL(cudnnBackendSetAttribute(desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &raw));
}
void SetAttr(const Descriptor& desc,
cudnnBackendAttributeName_t name,
const std::vector<Descriptor>& val) {
std::vector<cudnnBackendDescriptor_t> raw(val.size());
std::transform(val.begin(), val.end(), raw.begin(), [](const Descriptor& d) { return d.get(); });
CUDNN_CALL(cudnnBackendSetAttribute(
desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, raw.size(), &raw[0]));
}
Descriptor GetAttr(const Descriptor& desc,
cudnnBackendAttributeName_t name,
cudnnBackendDescriptorType_t type) {
cudnnBackendDescriptor_t ret{};
CUDNN_CALL(cudnnBackendCreateDescriptor(type, &ret));
int64_t count = 0;
CUDNN_CALL(
cudnnBackendGetAttribute(desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &count, &ret));
CHECK_EQ(count, 1);
return Descriptor(ret);
}
std::vector<Descriptor> GetAllAttrs(const Descriptor& desc,
cudnnBackendAttributeName_t name,
cudnnBackendDescriptorType_t type) {
int64_t count = 0;
CUDNN_CALL(cudnnBackendGetAttribute(
desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, 0, &count, nullptr));
auto raw = MakeRawDescriptors(count, type);
CUDNN_CALL(cudnnBackendGetAttribute(
desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, raw.size(), &count, raw.data()));
CHECK_LE(count, raw.size());
std::vector<Descriptor> ret(raw.begin(), raw.begin() + count);
for (size_t i = count; i < raw.size(); ++i)
CUDNN_CALL(cudnnBackendDestroyDescriptor(raw[i]));
return ret;
}
std::vector<Descriptor> GetSomeAttrs(size_t max_n,
const Descriptor& desc,
cudnnBackendAttributeName_t name,
cudnnBackendDescriptorType_t type) {
auto raw = MakeRawDescriptors(max_n, type);
int64_t count = 0;
CUDNN_CALL(cudnnBackendGetAttribute(
desc.get(), name, CUDNN_TYPE_BACKEND_DESCRIPTOR, raw.size(), &count, raw.data()));
std::vector<Descriptor> ret(count);
size_t i = 0;
for (; i < count; ++i)
ret[i] = Descriptor(raw[i]);
for (; i < max_n; ++i)
CUDNN_CALL(cudnnBackendDestroyDescriptor(raw[i]));
return ret;
}
std::vector<Descriptor> GetPlans(cudnnBackendHeurMode_t h_mode,
cudnnHandle_t handle,
const Descriptor& op_graph,
size_t workspace_limit,
size_t* max_workspace,
const std::unordered_set<int64_t>& excl_engines,
const std::vector<cudnnBackendNumericalNote_t>& req_numeric,
const std::vector<cudnnBackendNumericalNote_t>& excl_numeric,
#if CUDNN_VERSION >= 8200
const std::vector<cudnnBackendBehaviorNote_t>& req_behavior,
const std::vector<cudnnBackendBehaviorNote_t>& excl_behavior,
#endif // CUDNN_VERSION >= 8200
bool verbose_filter) {
auto heur = MakeFinalized(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH,
op_graph,
CUDNN_ATTR_ENGINEHEUR_MODE,
h_mode);
auto cfgs = GetAllAttrs(heur, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_BACKEND_ENGINECFG_DESCRIPTOR);
std::vector<Descriptor> plans;
if (max_workspace)
*max_workspace = 0;
for (const auto& cfg : cfgs) {
auto plan = Make(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
CUDNN_ATTR_EXECUTION_PLAN_HANDLE,
handle,
CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG,
cfg);
auto err = cudnnBackendFinalize(plan.get());
if (err == CUDNN_STATUS_NOT_SUPPORTED || err == CUDNN_STATUS_ARCH_MISMATCH)
continue;
if (err != CUDNN_STATUS_SUCCESS) {
LOG(WARNING) << "Unexpected cuDNN status: " << err << ": " << cudnnGetErrorString(err);
continue;
}
auto workspace = GetAttr<int64_t>(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE);
if (workspace_limit < workspace) {
if (verbose_filter)
LOG(INFO) << " Plan " << PlanStr(plan) << " exceeds workspace limit";
continue;
}
auto engine = GetAttr(cfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_BACKEND_ENGINE_DESCRIPTOR);
if (excl_engines.count(GetAttr<int64_t>(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX))) {
if (verbose_filter)
LOG(INFO) << " Plan " << PlanStr(plan) << " excluded by engine";
continue;
}
auto numerical = GetSomeAttrs<cudnnBackendNumericalNote_t>(
CUDNN_NUMERICAL_NOTE_TYPE_COUNT, engine, CUDNN_ATTR_ENGINE_NUMERICAL_NOTE);
if (!IsCompatible(numerical, req_numeric, excl_numeric)) {
if (verbose_filter)
LOG(INFO) << " Plan " << PlanStr(plan) << " has incompatible numerics";
continue;
}
#if CUDNN_VERSION >= 8200
auto behavior = GetSomeAttrs<cudnnBackendBehaviorNote_t>(
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, engine, CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE);
if (!IsCompatible(behavior, req_behavior, excl_behavior)) {
if (verbose_filter)
LOG(INFO) << " Plan " << PlanStr(plan) << " has incompatible behavior";
continue;
}
#endif // CUDNN_VERSION >= 8200
plans.push_back(std::move(plan));
if (max_workspace)
*max_workspace = std::max(*max_workspace, static_cast<size_t>(workspace));
}
return plans;
}
#if !defined(__CUDACC__) // Can be removed when CUDA 10 support is dropped.
Sampler MakeAvgSampler(size_t n, float max_cutoff_msec, size_t warmups) {
size_t warmups_performed = 0;
size_t k = 0;
float s = 0.0f;
if (n < 1)
n = 1;
return [n, max_cutoff_msec, warmups, warmups_performed, k, s](float x) mutable {
if (warmups_performed < warmups && x < max_cutoff_msec) {
warmups_performed++;
} else {
// Add this sample to the average calculation
s += x;
k++;
}
bool keep_going = k < n && x < max_cutoff_msec;
return keep_going ? std::nullopt : std::optional(s / k);
};
}
std::vector<FindResult> FindTopPlans(std::vector<Descriptor>&& plans,
size_t max_results,
cudnnHandle_t handle,
const Descriptor& var_pack,
Sampler sampler) {
// We're about to perform kernel timings, so we need to quiet the system by grabbing
// the Storage lock. Concurrent cudaMalloc's can disrupt the accurate timing
// measurements of the algos, and can prevent the cuda driver's proper freeing
// of temporary workspace allocations. Grabbing the lock might also
// impede other threads from launching work on the GPU.
std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
std::array<cudaEvent_t, 2> ev;
for (auto& ee : ev)
CUDA_CALL(cudaEventCreate(&ee));
auto cmp = [](const FindResult& lhs, const FindResult& rhs) { return lhs.time < rhs.time; };
cudaStream_t stream{};
CUDNN_CALL(cudnnGetStream(handle, &stream));
std::vector<FindResult> h;
for (size_t i = 0; i < plans.size(); ++i) {
auto&& plan = plans[i];
// Make a copy of the unused sampler for each plan's timing. Timed warm-up
// runs are handled by the sampler to enable early loop exit for slow kernels.
auto sampler_copy = sampler;
for (;;) {
CUDA_CALL(cudaEventRecord(ev[0], stream));
CUDNN_CALL(cudnnBackendExecute(handle, plan.get(), var_pack.get()));
CUDA_CALL(cudaEventRecord(ev[1], stream));
CUDA_CALL(cudaEventSynchronize(ev[1]));
float t = 0.0f;
CUDA_CALL(cudaEventElapsedTime(&t, ev[0], ev[1]));
if (auto r = sampler_copy(t); r) {
auto time_to_record = r.value();
if (h.size() == max_results) {
if (time_to_record < h[0].time) {
std::pop_heap(h.begin(), h.end(), cmp);
h.back() = {std::move(plan), i, time_to_record};
std::push_heap(h.begin(), h.end(), cmp);
}
} else {
h.push_back({std::move(plan), i, time_to_record});
std::push_heap(h.begin(), h.end(), cmp);
}
break;
}
}
}
for (auto& ee : ev)
CUDA_CALL(cudaEventDestroy(ee));
std::sort_heap(h.begin(), h.end(), cmp);
return h;
}
#endif // !defined(__CUDACC__)
std::string NoteStr(cudnnBackendNumericalNote_t note) {
std::unordered_map<cudnnBackendNumericalNote_t, std::string> m{
{CUDNN_NUMERICAL_NOTE_TENSOR_CORE, "tc"},
{CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS, "dci"},
{CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION, "rp"},
{CUDNN_NUMERICAL_NOTE_FFT, "fft"},
{CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC, "nd"},
{CUDNN_NUMERICAL_NOTE_WINOGRAD, "w"},
};
auto it = m.find(note);
return it != m.end() ? it->second : std::to_string(note);
}
std::string KnobStr(cudnnBackendKnobType_t knob) {
std::unordered_map<cudnnBackendKnobType_t, std::string> m {
{CUDNN_KNOB_TYPE_SPLIT_K, "split_k"}, {CUDNN_KNOB_TYPE_SWIZZLE, "swizzle"},
{CUDNN_KNOB_TYPE_TILE_SIZE, "tile_size"}, {CUDNN_KNOB_TYPE_USE_TEX, "use_tex"},
{CUDNN_KNOB_TYPE_EDGE, "edge"}, {CUDNN_KNOB_TYPE_KBLOCK, "kblock"},
{CUDNN_KNOB_TYPE_LDGA, "ldga"}, {CUDNN_KNOB_TYPE_LDGB, "ldgb"},
{CUDNN_KNOB_TYPE_CHUNK_K, "chunk_k"}, {CUDNN_KNOB_TYPE_SPLIT_H, "split_h"},
{CUDNN_KNOB_TYPE_WINO_TILE, "wino_tile"}, {CUDNN_KNOB_TYPE_MULTIPLY, "multiply"},
{CUDNN_KNOB_TYPE_SPLIT_K_BUF, "split_k_buf"}, {CUDNN_KNOB_TYPE_TILEK, "tilek"},
{CUDNN_KNOB_TYPE_STAGES, "stages"}, {CUDNN_KNOB_TYPE_REDUCTION_MODE, "reduction_mode"},
{CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE, "cta_split_k_mode"},
{CUDNN_KNOB_TYPE_SPLIT_K_SLC, "split_k_slc"}, {CUDNN_KNOB_TYPE_IDX_MODE, "idx_mode"},
{CUDNN_KNOB_TYPE_SLICED, "sliced"}, {CUDNN_KNOB_TYPE_SPLIT_RS, "split_rs"},
{CUDNN_KNOB_TYPE_SINGLEBUFFER, "singlebuffer"}, {CUDNN_KNOB_TYPE_LDGC, "ldgc"},
{CUDNN_KNOB_TYPE_SPECFILT, "specfilt"},
#if CUDNN_VERSION >= 8100
{CUDNN_KNOB_TYPE_KERNEL_CFG, "kernel_cfg"},
#endif // CUDNN_VERSION >= 8100
};
auto it = m.find(knob);
return it != m.end() ? it->second : std::to_string(knob);
}
std::string PlanStr(const Descriptor& plan) {
auto wks = GetAttr<int64_t>(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE);
auto cfg =
GetAttr(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_BACKEND_ENGINECFG_DESCRIPTOR);
auto engine = GetAttr(cfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_BACKEND_ENGINE_DESCRIPTOR);
auto engine_idx = GetAttr<int64_t>(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX);
std::ostringstream ss;
ss << "eng:" << engine_idx << " wksp:" << wks;
auto notes = GetSomeAttrs<cudnnBackendNumericalNote_t>(
CUDNN_NUMERICAL_NOTE_TYPE_COUNT, engine, CUDNN_ATTR_ENGINE_NUMERICAL_NOTE);
for (auto note : notes)
ss << " " << NoteStr(note);
auto choices = GetSomeAttrs(CUDNN_KNOB_TYPE_COUNTS,
cfg,
CUDNN_ATTR_ENGINECFG_KNOB_CHOICES,
CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR);
for (const auto& choice : choices) {
auto type = GetAttr<cudnnBackendKnobType_t>(choice, CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE);
auto val = GetAttr<int64_t>(choice, CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE);
ss << " " << KnobStr(type) << ":" << val;
}
return ss.str();
}
} // namespace cudnn_cxx
} // namespace mxnet
#endif // MXNET_USE_CUDNN == 1