// blob: 1244fddf09830b60215818d61e1848b14fe88fd2 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*!
 * \file
 */
#include <dmlc/parameter.h>
#include <dmlc/thread_local.h>
#include <tvm/runtime/profiling.h>
#include <tvm/runtime/registry.h>
#include <sstream>
#include "opencl_common.h"
namespace tvm {
namespace runtime {
namespace cl {
// Forward declarations of the query helpers that are defined further down in
// this file, so that code appearing before those definitions can call them.
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
std::string GetOpenCLVersion(cl_device_id pid);
/*!
 * \brief Geometry handed to the OpenCL image transfer calls.
 *
 * All members are zero-initialized by default.
 */
struct ImageInfo {
  /*! \brief Offset inside the image at which the transfer starts. */
  size_t origin[3] = {};
  /*! \brief Extent of the transferred region (width, height, depth). */
  size_t region[3] = {};
  /*! \brief Row pitch forwarded to the image read/write calls. */
  size_t row_pitch = 0;
  /*! \brief Slice pitch forwarded to the image read/write calls. */
  size_t slice_pitch = 0;
};
* \brief Utility to apply a memory layout specific lowering convention
* to infer the physical shape from the provided DLTensor's logical shape.
* \param desc Descriptor which contains the buffer and layout tag.
* \param The DLTensor used to infer the tensors physical shape.
ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
ImageInfo info{};
ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;
info.origin[0] = info.origin[1] = info.origin[2] = 0;
info.row_pitch = 0;
info.slice_pitch = 0;
size_t axis = DefaultTextureLayoutSeparator(
tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, axis);
info.region[0] = texture_shape.width;
info.region[1] = texture_shape.height;
info.region[2] = 1;
return info;
/*!
 * \brief Map a memory scope string to the matching buffer memory layout.
 *
 * An undefined scope and the plain "global" scope both select the 1-D buffer
 * layout; "global" is the string ScopeFromMemoryLayout produces for kBuffer1D,
 * so the two functions round-trip.
 *
 * \param mem_scope Optional memory scope string.
 * \return The memory layout for the scope. Unknown scopes are reported through
 *         LOG(FATAL).
 */
cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
    Optional<String> mem_scope) {
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
  } else if (mem_scope.value() == "global.texture") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
  } else if (mem_scope.value() == "global.texture-weight") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
  } else if (mem_scope.value() == "global.texture-nhwc") {
    return cl::BufferDescriptor::MemoryLayout::kImage2DNHWC;
  }
  LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
  // Only reached if LOG(FATAL) returns; keeps every control path returning a value.
  return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
}
/*!
 * \brief Map a buffer memory layout to its memory scope string.
 * \param layout The memory layout to translate.
 * \return The scope string for the layout. A layout without a case below is
 *         reported through LOG(FATAL) and yields an empty string.
 */
String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
  switch (layout) {
    case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
      return "global";
    case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      return "global.texture";
    case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
      return "global.texture-weight";
    case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
      return "global.texture-nhwc";
  }
  LOG(FATAL) << "No scope corresponding to the provided memory layout: "
             << static_cast<int>(layout);
  return "";
}
/*! \brief Fetch the thread-local OpenCL entry via OpenCLThreadEntry::ThreadLocal(). */
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
  OpenCLThreadEntry* entry = OpenCLThreadEntry::ThreadLocal();
  return entry;
}
/*!
 * \brief Access the process-wide OpenCL workspace.
 *
 * The instance is created on first use and is never deleted in this file.
 *
 * \return Pointer to the shared workspace instance.
 */
OpenCLWorkspace* OpenCLWorkspace::Global() {
  static OpenCLWorkspace* inst = new OpenCLWorkspace();
  return inst;
}
/*!
 * \brief Record the device id of \p dev in the current thread entry.
 * \param dev Device whose id is stored.
 */
void OpenCLWorkspace::SetDevice(Device dev) {
  OpenCLThreadEntry* entry = GetThreadEntry();
  entry->device.device_id = dev.device_id;
}
/*!
 * \brief Query an attribute of an OpenCL device.
 *
 * kExist is answered without touching the device, so it is safe for
 * out-of-range ids. Every other attribute requires a valid device index.
 *
 * \param dev  Device whose device_id indexes into `devices`.
 * \param kind Attribute to query.
 * \param rv   Receives the attribute value. It is left untouched for
 *             kMaxRegistersPerBlock and kGcnArch, which are not answered here.
 *
 * NOTE(review): no lazy-initialization call is visible in this function;
 * confirm the workspace is initialized before it is reached.
 */
void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
  const size_t index = static_cast<size_t>(dev.device_id);
  if (kind == kExist) {
    *rv = static_cast<int>(index < devices.size());
    return;
  }
  ICHECK_LT(index, devices.size()) << "Invalid device id " << index << ". " << GetError();
  switch (kind) {
    case kExist:
      // Handled by the early return above.
      break;
    case kMaxThreadsPerBlock: {
      size_t value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
                                  &value, nullptr));
      *rv = static_cast<int64_t>(value);
      break;
    }
    case kWarpSize: {
      /* TODO: the warp size of OpenCL device is not always 1
               e.g. Intel Graphics has a sub group concept which contains 8 - 32 work items,
               corresponding to the number of SIMD entries the hardware configures.
               We need to figure out a way to query this information from the hardware.
      */
      const int warp_size = dmlc::GetEnv("TVM_OPENCL_WARP_SIZE", 1);
      *rv = warp_size;
      break;
    }
    case kMaxSharedMemoryPerBlock: {
      cl_ulong value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong),
                                  &value, nullptr));
      *rv = static_cast<int64_t>(value);
      break;
    }
    case kComputeVersion:
      *rv = GetOpenCLVersion(devices[index]);
      break;
    case kDeviceName:
      *rv = GetDeviceInfo(devices[index], CL_DEVICE_NAME);
      break;
    case kMaxClockRate: {
      cl_uint value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint),
                                  &value, nullptr));
      // OpenCL returns the clock rate in MHz, while CUDA/ROCm return the
      // clock rate in kHz.  Converting to the same units for each.
      *rv = static_cast<int32_t>(value * 1000);
      break;
    }
    case kMultiProcessorCount: {
      cl_uint value;
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
                                  &value, nullptr));
      *rv = static_cast<int32_t>(value);
      break;
    }
    case kMaxThreadDimensions: {
      size_t dims[3];
      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(dims), dims,
                                  nullptr));

      std::stringstream ss;  // use json string to return multiple int values;
      ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
      *rv = ss.str();
      break;
    }
    case kMaxRegistersPerBlock:
      // Not answered for OpenCL; rv is left as provided by the caller.
      return;
    case kGcnArch:
      // Not answered for OpenCL; rv is left as provided by the caller.
      return;
    case kApiVersion: {
      // NOTE(review): the body of this case is missing in the reviewed text;
      // reporting the OpenCL version this runtime targets — confirm.
      *rv = CL_TARGET_OPENCL_VERSION;
      break;
    }
    case kDriverVersion: {
      // Zero-filled and queried with one byte less than its size so the
      // result is always NUL-terminated.
      char value[128] = {0};
      OPENCL_CALL(
          clGetDeviceInfo(devices[index], CL_DRIVER_VERSION, sizeof(value) - 1, value, nullptr));
      *rv = std::string(value);
      break;
    }
  }
}
/*!
 * \brief Allocate a flat (1-D) OpenCL buffer.
 *
 * \param dev       Target device (not used by this implementation).
 * \param size      Requested size in bytes; a request of 0 is rounded up to 1.
 * \param alignment Requested alignment (not used by this implementation).
 * \param type_hint Element type hint (not used by this implementation).
 * \return A heap-allocated cl::BufferDescriptor, returned as void*, whose
 *         layout is kBuffer1D. Release it with FreeDataSpace.
 */
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
                                      DLDataType type_hint) {
  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
  // CL_INVALID_BUFFER_SIZE if size is 0.
  if (size == 0) {
    size = 1;
  }
  cl_int err_code = CL_SUCCESS;
  cl_mem buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
  ICHECK_EQ(err_code, CL_SUCCESS) << "clCreateBuffer failed with error code " << err_code;

  // The descriptor is created only after the buffer is known to be valid, so
  // a failed allocation does not leak it.
  cl::BufferDescriptor* desc = new cl::BufferDescriptor;
  desc->buffer = buffer;
  desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
  return desc;
}
/*!
 * \brief Allocate storage for a tensor of the given shape and memory scope.
 *
 * An undefined or "global" scope is delegated to DeviceAPI::AllocDataSpace.
 * Any other scope must be a texture scope and is backed by a 2-D image whose
 * size is derived from the flattened shape.
 *
 * \param dev       Target device.
 * \param ndim      Number of dimensions in \p shape.
 * \param shape     Logical tensor shape.
 * \param dtype     Element type.
 * \param mem_scope Optional memory scope.
 * \return For texture scopes, a heap-allocated cl::BufferDescriptor returned
 *         as void*; otherwise whatever DeviceAPI::AllocDataSpace returns.
 */
void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                      Optional<String> mem_scope) {
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
  }
  // NOTE(review): the predicate that heads this check is missing in the
  // reviewed text; the substring test mirrors the "texture" check used by the
  // alloc_nd / free_nd packed functions in this file — confirm.
  ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos)
      << "Device does not support allocate data space with "
      << "specified memory scope: " << mem_scope.value();

  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
                   << "provided shape is rank " << ndim;

  cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
  const size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
  const auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
  desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
  return desc;
}
/*!
 * \brief Release storage obtained from AllocDataSpace.
 *
 * \param dev Device whose command queue is drained before the release.
 * \param ptr The cl::BufferDescriptor returned by AllocDataSpace.
 */
void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
  // We have to make sure that the memory object is not in the command queue
  // for some OpenCL platforms.
  OPENCL_CALL(clFinish(this->GetQueue(dev)));

  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
  // Release the OpenCL memory object as well; deleting the descriptor alone
  // would leak it.
  OPENCL_CALL(clReleaseMemObject(desc->buffer));
  delete desc;
}
/*!
 * \brief Create a read/write 2-D RGBA image.
 *
 * \param dev       Target device (not used by this implementation).
 * \param width     Image width passed to the image descriptor.
 * \param height    Image height passed to the image descriptor.
 * \param type_hint Element type, converted to an OpenCL channel type.
 * \return The created image object.
 */
cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
                                     DLDataType type_hint) {
  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
  cl_int err_code = CL_SUCCESS;
  const cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
  cl_image_format format = {CL_RGBA, cl_type};
  cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
  cl_mem mptr =
      clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, nullptr, &err_code);
  // Surface creation failures here instead of handing back an invalid handle.
  ICHECK_EQ(err_code, CL_SUCCESS) << "clCreateImage failed with error code " << err_code;
  return mptr;
}
/*!
 * \brief Allocate a temporary texture from the current thread's texture pool.
 * \param dev       Target device.
 * \param width     Texture width.
 * \param height    Texture height.
 * \param type_hint Element type.
 * \return Whatever the pool's AllocTexture returns for this request.
 */
void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
                                             DLDataType type_hint) {
  return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint);
}
/*!
 * \brief Return a temporary texture to the current thread's texture pool.
 * \param dev Device the texture was allocated for.
 * \param ptr Handle previously returned by AllocTextureWorkspace.
 */
void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
  GetThreadEntry()->texture_pool.FreeTexture(dev, ptr);
}
/*!
 * \brief Copy the contents of one tensor into another.
 *
 * Three directions are supported: OpenCL to OpenCL, OpenCL to CPU and CPU to
 * OpenCL. Both tensors must be contiguous and have the same byte size. The
 * layout tag of each OpenCL descriptor selects between the buffer and image
 * variants of the OpenCL copy/read/write calls.
 *
 * \param from   Source tensor.
 * \param to     Destination tensor.
 * \param stream Stream handle (not used by this implementation).
 */
void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  const size_t nbytes = GetDataSize(*from);
  ICHECK_EQ(nbytes, GetDataSize(*to));
  ICHECK(IsContiguous(*from) && IsContiguous(*to))
      << "CopyDataFromTo only support contiguous array for now";

  if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
    // Device to device: choose the call by the (source, destination) layouts.
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
        from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
      OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
                                      to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
                                      0, nullptr, nullptr));
    } else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
               from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
      auto image_info = GetImageInfo(to_desc, to);
      OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
                                             to_desc->buffer, from->byte_offset, image_info.origin,
                                             image_info.region, 0, nullptr, nullptr));
    } else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
               from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
      auto image_info = GetImageInfo(from_desc, from);
      OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
                                             to_desc->buffer, image_info.origin, image_info.region,
                                             to->byte_offset, 0, nullptr, nullptr));
    } else {
      auto to_image_info = GetImageInfo(to_desc, to);
      auto from_image_info = GetImageInfo(from_desc, from);
      OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
                                     from_image_info.origin, to_image_info.origin,
                                     to_image_info.region, 0, nullptr, nullptr));
    }
  } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
    // Device to host.
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    switch (from_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueReadBuffer(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
      case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC: {
        auto image_info = GetImageInfo(from_desc, from);
        // TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
        // Note that when utilizing texture pools for memory reuse, the allocated image
        // size can be larger than the size to be read.
        OPENCL_CALL(clEnqueueReadImage(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
      }
    }
    // The reads above are non-blocking (CL_FALSE); wait for the queue so the
    // host buffer is fully written before control returns to the caller.
    OPENCL_CALL(clFinish(this->GetQueue(from->device)));
  } else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
    // Host to device.
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    switch (to_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueWriteBuffer(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
      case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC: {
        auto image_info = GetImageInfo(to_desc, to);
        OPENCL_CALL(clEnqueueWriteImage(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
      }
    }
    // The writes above are non-blocking (CL_FALSE); wait for the queue so the
    // caller may safely reuse or free the host buffer afterwards.
    OPENCL_CALL(clFinish(this->GetQueue(to->device)));
  } else {
    LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
  }
}
/*!
 * \brief Block until the commands queued for \p dev have completed.
 * \param dev    Device whose command queue is drained.
 * \param stream Must be nullptr; only the per-device queue is synchronized.
 */
void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
  ICHECK(stream == nullptr);
  OPENCL_CALL(clFinish(this->GetQueue(dev)));
}
/*!
 * \brief Allocate temporary workspace from the current thread's pool.
 * \param dev       Target device.
 * \param size      Requested size, forwarded to the pool.
 * \param type_hint Element type hint (not used by this implementation).
 * \return Whatever the pool's AllocWorkspace returns for this request.
 */
void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
  return GetThreadEntry()->pool.AllocWorkspace(dev, size);
}
/*!
 * \brief Return temporary workspace to the current thread's pool.
 * \param dev  Device the workspace was allocated for.
 * \param data Handle previously returned by AllocWorkspace.
 */
void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
  GetThreadEntry()->pool.FreeWorkspace(dev, data);
}
/*! \brief Thread-local store holding one OpenCLThreadEntry. */
using OpenCLThreadStore = dmlc::ThreadLocalStore<OpenCLThreadEntry>;

/*! \brief Fetch the entry kept in OpenCLThreadStore. */
OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
  OpenCLThreadEntry* entry = OpenCLThreadStore::Get();
  return entry;
}
/*!
 * \brief Query a string-valued property of an OpenCL platform.
 *
 * The size is queried first and the value is then read into a string of
 * exactly that size.
 *
 * \param pid        Platform to query.
 * \param param_name Property to read.
 * \return The raw property bytes. NOTE(review): OpenCL string queries
 *         normally count the terminating NUL in the reported size, so the
 *         result may end with '\0' — confirm whether callers rely on that.
 */
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
  size_t ret_size = 0;
  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
  std::string ret;
  if (ret_size == 0) return ret;
  // The string must own ret_size bytes before OpenCL writes into it.
  ret.resize(ret_size);
  OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
  return ret;
}
/*!
 * \brief Query a string-valued property of an OpenCL device.
 *
 * \param pid        Device to query.
 * \param param_name Property to read.
 * \return The property as a C string, i.e. truncated at the first NUL byte.
 */
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
  size_t ret_size = 0;
  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
  // One extra zero byte guarantees termination even for an empty or
  // unterminated reply. The vector also releases the storage if OPENCL_CALL
  // raises, which a raw new[]/delete[] pair would not.
  std::vector<char> info(ret_size + 1, '\0');
  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, info.data(), nullptr));
  return std::string(info.data());
}
/*!
 * \brief Extract the "$MAJOR.$MINOR" part of a device's OpenCL version string.
 * \param pid Device to query.
 * \return The text between the leading "OpenCL " prefix and the next space,
 *         or everything after the prefix when no further space exists.
 */
std::string GetOpenCLVersion(cl_device_id pid) {
  // String returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO".  To
  // match other implementations, we want to return "$MAJOR.$MINOR"
  std::string ret = GetDeviceInfo(pid, CL_DEVICE_VERSION);

  const size_t version_start = 7;  // Length of initial "OpenCL " prefix to skip
  const size_t version_end = ret.find(' ', version_start);
  return ret.substr(version_start, version_end - version_start);
}
std::vector<cl_platform_id> GetPlatformIDs() {
cl_uint ret_size;
cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
std::vector<cl_platform_id> ret;
if (code != CL_SUCCESS) return ret;
OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
return ret;
/*!
 * \brief List the devices of one platform, filtered by device type.
 * \param pid         Platform to enumerate.
 * \param device_type "cpu", "gpu" or "accelerator"; any other value selects
 *                    all device types.
 * \return The matching device ids, or an empty vector when the count query
 *         fails or reports no device.
 */
std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
  cl_device_type dtype = CL_DEVICE_TYPE_ALL;
  if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
  if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
  if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
  cl_uint ret_size = 0;
  const cl_int code = clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size);
  std::vector<cl_device_id> ret;
  if (code != CL_SUCCESS || ret_size == 0) return ret;
  // The vector must hold ret_size elements before OpenCL writes into it.
  ret.resize(ret_size);
  OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, ret.data(), nullptr));
  return ret;
}
/*!
 * \brief Check whether a platform property contains the given text.
 * \param pid        Platform to query.
 * \param param_name Property to read.
 * \param value      Substring to look for; an empty string matches everything.
 * \return true if \p value is empty or occurs in the property value.
 */
bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
  if (value.empty()) return true;
  const std::string param_value = GetPlatformInfo(pid, param_name);
  return param_value.find(value) != std::string::npos;
}
void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
const std::string& platform_name) {
if (initialized_) return;
std::lock_guard<std::mutex> lock(this->mu);
if (initialized_) return;
if (context != nullptr) return;
this->type_key = type_key;
// matched platforms
std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
if (platform_ids.size() == 0) {
LOG(WARNING) << "No OpenCL platform matched given existing options ...";
this->platform_id = nullptr;
for (auto platform_id : platform_ids) {
if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
if ((devices_matched.size() == 0) && (device_type == "gpu")) {
LOG(WARNING) << "Using CPU OpenCL device";
devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
std::vector<cl_device_id> supported_devices = {};
auto get_version_str = [](int version) {
std::ostringstream out;
out << std::fixed << version / 100.f;
return out.str();
for (auto& device : devices_matched) {
std::string ver = GetOpenCLVersion(device);
int opencl_version = std::stod(ver) * 100;
if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
} else {
std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
" has OpenCL version == " + get_version_str(opencl_version);
LOG(WARNING) << "TVM supports devices with OpenCL version >= "
<< get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
<< ". This device will be ignored.";
if (noDevicesErrorMsg.empty()) {
noDevicesErrorMsg =
"Probably this error happen because TVM supports devices with OpenCL version >= " +
get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
noDevicesErrorMsg += "\t" + dev_msg + "\n";
if (supported_devices.size() > 0) {
this->platform_id = platform_id;
this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
this->device_type = device_type;
this->devices = supported_devices;
if (this->platform_id == nullptr) {
LOG(WARNING) << "No OpenCL device";
initialized_ = true;
cl_int err_code;
this->context = clCreateContext(nullptr, this->devices.size(), &(this->devices[0]), nullptr,
nullptr, &err_code);
ICHECK_EQ(this->queues.size(), 0U);
for (size_t i = 0; i < this->devices.size(); ++i) {
cl_device_id did = this->devices[i];
this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code));
initialized_ = true;
TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
int32_t device_type = args[0];
int32_t device_id = args[1];
int32_t dtype_code_hint = args[2];
int32_t dtype_bits_hint = args[3];
std::string scope = args[4];
CHECK(scope.find("texture") != std::string::npos);
int64_t ndim = args[5];
CHECK_EQ(ndim, 2);
int64_t* shape = static_cast<int64_t*>(static_cast<void*>(args[6]));
int64_t width = shape[0];
int64_t height = shape[1];
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
DLDataType type_hint;
type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
type_hint.lanes = 1;
OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
*rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height),
TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
int32_t device_type = args[0];
int32_t device_id = args[1];
std::string scope = args[2];
CHECK(scope.find("texture") != std::string::npos);
void* data = args[3];
OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
ptr->FreeTextureWorkspace(dev, data);
*rv = static_cast<int32_t>(0);
TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) {
return Timer(make_object<OpenCLTimerNode>(dev));
} // namespace cl
// Out-of-class definitions of OpenCLTimerNode's static data members. The
// class itself is declared elsewhere (presumably opencl_common.h — confirm).
size_t OpenCLTimerNode::count_timer_execs = 0;
std::vector<size_t> OpenCLTimerNode::event_start_idxs;
} // namespace runtime
} // namespace tvm