/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file opencl_device_api.cc
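* \brief OpenCL device API implementation.
*
* Illustrative usage sketch (not part of this file; it assumes only the
* generic DeviceAPI entry points that this file implements):
* \code
*   Device dev{kDLOpenCL, 0};
*   DeviceAPI* api = DeviceAPI::Get(dev);
*   DLDataType f32{kDLFloat, 32, 1};
*   void* data = api->AllocDataSpace(dev, 1024, kAllocAlignment, f32);
*   api->FreeDataSpace(dev, data);
* \endcode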
*/
#include <dmlc/parameter.h>
#include <dmlc/thread_local.h>
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/profiling.h>
#include <sstream>
#include "../memory/pooled_allocator.h"
#include "opencl_common.h"
#ifdef OPENCL_ENABLE_HOST_PTR
#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR
#else
#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE
#endif
namespace tvm {
namespace runtime {
namespace cl {
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
std::string GetOpenCLVersion(cl_device_id pid);
struct ImageInfo {
size_t origin[3] = {};
size_t region[3] = {};
size_t row_pitch = 0;
size_t slice_pitch = 0;
};
/*!
* \brief Utility to apply a memory layout specific lowering convention
* to infer the physical shape from the provided DLTensor's logical shape.
* \param desc Descriptor which contains the buffer and layout tag.
* \param tensor The DLTensor used to infer the tensor's physical shape.
* \return The image origin, region, and pitch describing the physical layout.
*/
ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
ImageInfo info{};
ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;
info.origin[0] = info.origin[1] = info.origin[2] = 0;
info.row_pitch = 0;
info.slice_pitch = 0;
size_t axis = DefaultTextureLayoutSeparator(
tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, axis);
info.region[0] = texture_shape.width;
info.region[1] = texture_shape.height;
info.region[2] = 1;
return info;
}
cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
ffi::Optional<ffi::String> mem_scope) {
if (!mem_scope.has_value()) {
return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
} else if (mem_scope.value() == "global.texture") {
return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
} else if (mem_scope.value() == "global.texture-weight") {
return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
} else if (mem_scope.value() == "global.texture-nhwc") {
return cl::BufferDescriptor::MemoryLayout::kImage2DNHWC;
}
LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
}
ffi::String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
switch (layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
return "global";
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
return "global.texture";
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
return "global.texture-weight";
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
return "global.texture-nhwc";
}
LOG(FATAL) << "No scope corresponding to the provided memory layout: "
<< static_cast<int>(layout);
return "";
}
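// Builds a temporary DLTensor with the given shape and dtype so the device API
// can report the allocation size in bytes.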
static size_t GetMemObjectSize(Device dev, int ndim, const int64_t* shape, DLDataType dtype) {
DLTensor temp;
temp.data = nullptr;
temp.device = dev;
temp.ndim = ndim;
temp.dtype = dtype;
temp.shape = const_cast<int64_t*>(shape);
temp.strides = nullptr;
temp.byte_offset = 0;
size_t size = DeviceAPI::Get(dev)->GetDataSize(temp);
return size;
}
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); }
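// Process-wide workspace singleton; heap-allocated and never freed so it stays
// valid through static destruction.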
OpenCLWorkspace* OpenCLWorkspace::Global() {
static OpenCLWorkspace* inst = new OpenCLWorkspace();
return inst;
}
cl_device_id OpenCLWorkspace::GetCLDeviceID(int device_id) {
this->Init();
ICHECK_LT(device_id, devices.size()) << "Invalid device id " << device_id << ". " << GetError();
return devices[device_id];
}
void OpenCLWorkspace::SetDevice(Device dev) { GetThreadEntry()->device.device_id = dev.device_id; }
void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) {
this->Init();
size_t index = static_cast<size_t>(dev.device_id);
if (kind == kExist) {
*rv = static_cast<int>(index < devices.size());
return;
}
cl_device_id device_id = GetCLDeviceID(index);
switch (kind) {
case kExist:
break;
case kMaxThreadsPerBlock: {
size_t value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value,
nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kWarpSize: {
/* TODO: The warp size of an OpenCL device is not always 1;
e.g., Intel Graphics has a sub-group concept which contains 8 - 32 work items,
corresponding to the number of SIMD entries the hardware configures.
We need to figure out a way to query this information from the hardware.
*/
const int warp_size = dmlc::GetEnv("TVM_OPENCL_WARP_SIZE", 1);
*rv = warp_size;
break;
}
case kMaxSharedMemoryPerBlock: {
cl_ulong value;
OPENCL_CALL(
clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &value, nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kComputeVersion:
*rv = GetOpenCLVersion(device_id);
break;
case kDeviceName:
*rv = GetDeviceInfo(device_id, CL_DEVICE_NAME);
break;
case kMaxClockRate: {
cl_uint value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &value,
nullptr));
// OpenCL reports the clock rate in MHz, while CUDA/ROCm report it in kHz;
// convert to kHz for consistency.
*rv = static_cast<int32_t>(value * 1000);
break;
}
case kMultiProcessorCount: {
cl_uint value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &value,
nullptr));
*rv = static_cast<int32_t>(value);
break;
}
case kMaxThreadDimensions: {
size_t dims[3];
OPENCL_CALL(
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(dims), dims, nullptr));
std::stringstream ss; // Use a JSON string to return multiple int values.
ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
*rv = ss.str();
break;
}
case kMaxRegistersPerBlock:
return;
case kGcnArch:
return;
case kApiVersion: {
*rv = CL_TARGET_OPENCL_VERSION;
break;
}
case kDriverVersion: {
char value[128] = {0};
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(value) - 1, value, nullptr));
*rv = std::string(value);
break;
}
case kL2CacheSizeBytes: {
// NOTE(Zihao): this query may not reflect the real L2 cache size on CUDA/AMD GPUs.
cl_ulong value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(value), &value,
nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kTotalGlobalMemory: {
cl_ulong total_global_memory;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(total_global_memory),
&total_global_memory, nullptr));
*rv = static_cast<int64_t>(total_global_memory);
return;
}
case kAvailableGlobalMemory:
// Not currently implemented. Based on
// https://stackoverflow.com/a/3568223, may not be implementable
// at all through OpenCL API.
break;
case kImagePitchAlignment: {
*rv = static_cast<int64_t>(device_info[device_id].image_row_align);
break;
}
}
}
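// When built with OPENCL_ENABLE_HOST_PTR, maps the buffer into host-visible
// memory so the CPU can write it directly; otherwise this is a pass-through.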
void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device dev, size_t size) {
#if defined(OPENCL_ENABLE_HOST_PTR)
this->Init();
cl_int err_code;
desc->host_ptr = reinterpret_cast<cl_uchar*>(
clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0,
sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code));
OPENCL_CHECK_ERROR(err_code);
#endif // OPENCL_ENABLE_HOST_PTR
return desc;
}
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
DLDataType type_hint) {
this->Init();
return AllocCLBuffer(dev, size, alignment, type_hint);
}
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint,
ffi::Optional<ffi::String> mem_scope) {
// Texture allocation given width and height
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
size_t mem_size = row_pitch * height;
// Alloc back buffer from pool
cl::BufferDescriptor* back_buffer = nullptr;
if (IsBufferToImageSupported(dev.device_id)) {
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint);
back_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
back_buffer->mbuf = buf;
}
if (!mem_scope.has_value()) {
mem_scope = ffi::String("global.texture");
}
return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope);
}
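// N-d allocation: the default "global" scope goes through the pooled buffer
// allocator, while texture scopes are flattened to a 2D image shape first.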
void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
ffi::Optional<ffi::String> mem_scope) {
this->Init();
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
size_t size = GetMemObjectSize(dev, ndim, shape, dtype);
cl::BufferDescriptor* ret_buffer = nullptr;
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, size, kTempAllocaAlignment, dtype);
ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
ret_buffer->mbuf = buf;
return ret_buffer;
}
size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope);
}
void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment,
DLDataType type_hint) {
this->Init();
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
cl_int err_code;
cl::BufferDescriptor* desc = new cl::BufferDescriptor;
// clCreateBuffer fails with CL_INVALID_BUFFER_SIZE when size is 0, so allocate
// at least one byte.
if (size == 0) {
size = 1;
}
desc->buffer =
clCreateBuffer(this->contexts[platform], CL_MEM_CREATE_FLAGS, size, nullptr, &err_code);
desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
return CreateHostPtrIfEnabled(desc, dev, size);
}
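// Creates a 2D RGBA image. When cl_khr_image2d_from_buffer is supported, the
// image aliases the provided back buffer instead of owning separate storage.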
void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height,
size_t row_pitch, DLDataType type_hint,
ffi::Optional<ffi::String> mem_scope) {
this->Init();
ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos)
<< "Expect texture scope while creating an Image object";
cl::BufferDescriptor* back_desc = static_cast<cl::BufferDescriptor*>(back_buffer);
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
cl_int err_code;
cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
cl_image_format format = {CL_RGBA, cl_type};
cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
if (IsBufferToImageSupported(dev.device_id)) {
descriptor.image_row_pitch = row_pitch;
descriptor.buffer = back_desc->buffer;
}
cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_CREATE_FLAGS, &format, &descriptor,
nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
desc->buffer = mptr;
desc->back_buffer = back_desc;
return desc;
}
size_t OpenCLWorkspace::GetDataSize(const DLTensor& arr, ffi::Optional<ffi::String> mem_scope) {
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
return DeviceAPI::GetDataSize(arr);
}
cl_uint row_align = GetImageAlignment(GetThreadEntry()->device.device_id);
std::vector<int64_t> shape;
shape.assign(arr.shape, arr.shape + arr.ndim);
return runtime::GetTextureMemorySize<std::vector<int64_t>>(shape, arr.dtype.bits, arr.dtype.lanes,
mem_scope.value(), row_align);
}
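// Reinterprets existing memory under a new scope (buffer <-> image). Devices
// lacking cl_khr_image2d_from_buffer cannot alias, so they get a separate
// allocation marked is_compat_view.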
void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape shape,
DLDataType dtype, ffi::Optional<ffi::String> mem_scope) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data);
// Fallback for devices without "cl_khr_image2d_from_buffer"
if (!IsBufferToImageSupported(dev.device_id)) {
cl::BufferDescriptor* ret_desc = desc; // buffer -> buffer
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// image -> buffer
size_t nbytes = GetMemObjectSize(dev, shape.size(), shape.data(), dtype);
ret_desc = static_cast<cl::BufferDescriptor*>(
OpenCLWorkspace::AllocCLBuffer(dev, nbytes, kTempAllocaAlignment, dtype));
ret_desc->is_compat_view = true;
}
} else {
// Any -> Image
size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis);
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
ret_desc = static_cast<cl::BufferDescriptor*>(OpenCLWorkspace::Global()->AllocCLImage(
dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope));
ret_desc->is_compat_view = true;
}
return ret_desc;
}
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// buffer -> buffer
return desc;
} else {
// image -> buffer
return desc->back_buffer;
}
}
size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis);
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
cl::BufferDescriptor* back_buffer;
if (desc->back_buffer) {
// image -> image
back_buffer = desc->back_buffer;
} else {
// buffer -> image
back_buffer = desc;
}
return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height,
row_pitch, dtype, mem_scope);
}
void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) {
auto* desc = static_cast<const cl::BufferDescriptor*>(ptr);
// Handle the fall back
if (!IsBufferToImageSupported(dev.device_id)) {
if (desc->is_compat_view) {
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
}
return;
}
if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
}
}
void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::Tensor& narr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data);
return desc->host_ptr;
}
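// Rebinds a 1D buffer to caller-provided host memory via the QCOM external
// host pointer extension; only takes effect when built with USE_OPENCL_EXTN_QCOM.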
void OpenCLWorkspace::SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr,
size_t buf_size) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data);
this->Init();
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
#ifdef USE_OPENCL_EXTN_QCOM
Device dev = narr.operator->()->device;
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
if (desc->host_ptr) {
OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr,
nullptr));
desc->host_ptr = nullptr;
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
cl_int err_code;
desc->buffer =
clCreateBuffer(this->contexts[platform],
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM, buf_size,
host_ptr, &err_code);
desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
#endif
} else {
LOG(FATAL) << "Native Ptr not enabled over image objects";
}
}
void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) {
#ifdef CL_CONTEXT_PERF_HINT_QCOM
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
OPENCL_CALL(clSetPerfHintQCOM(this->contexts[platform], perf_hint));
#endif
}
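// Releases memory from any allocation path: images with pooled back buffers,
// pooled 1D buffers (unmapping any host pointer first), or standalone images
// on devices without buffer-to-image support.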
void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
if (desc->back_buffer) {
// 2D Image w/ back buffer allocated from pool
OPENCL_CALL(clReleaseMemObject(desc->buffer));
MemoryManager::GetAllocator(dev, desc->back_buffer->mbuf.alloc_type)
->Free(desc->back_buffer->mbuf);
delete desc;
} else {
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// 1D buffer allocated from pool
if (desc->host_ptr) {
clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr);
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
} else if (!IsBufferToImageSupported(dev.device_id)) {
// 2D Image allocated w/o pool
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
return;
}
}
}
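// Dispatches on the source/destination memory layouts: buffer<->buffer,
// buffer<->image, and image<->image copies on device, plus buffer/image reads
// and writes against host memory.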
void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
this->Init();
size_t nbytes = GetDataSize(*from);
ICHECK_EQ(nbytes, GetDataSize(*to));
ICHECK(IsContiguous(*from) && IsContiguous(*to))
<< "CopyDataFromTo only support contiguous array for now";
if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
0, nullptr, nullptr));
} else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
auto image_info = GetImageInfo(to_desc, to);
OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, from->byte_offset, image_info.origin,
image_info.region, 0, nullptr, nullptr));
} else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
auto image_info = GetImageInfo(from_desc, from);
OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, image_info.origin, image_info.region,
to->byte_offset, 0, nullptr, nullptr));
} else {
auto to_image_info = GetImageInfo(to_desc, to);
auto from_image_info = GetImageInfo(from_desc, from);
OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
from_image_info.origin, to_image_info.origin,
to_image_info.region, 0, nullptr, nullptr));
}
} else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
switch (from_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
auto image_info = GetImageInfo(from_desc, from);
// TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
// Note that when utilizing texture pools for memory reuse, the allocated image
// size can be larger than the size to be read.
OPENCL_CALL(clEnqueueReadImage(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(from->device)));
} else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
switch (to_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
auto image_info = GetImageInfo(to_desc, to);
OPENCL_CALL(clEnqueueWriteImage(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(to->device)));
} else {
LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
}
}
void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
this->Init();
ICHECK(stream == nullptr);
OPENCL_CALL(clFinish(this->GetQueue(dev)));
}
void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
this->Init();
cl::BufferDescriptor* ret_buffer = nullptr;
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, size, kTempAllocaAlignment, type_hint);
ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
ret_buffer->mbuf = buf;
return ret_buffer;
}
void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data);
MemoryManager::GetAllocator(dev, desc->mbuf.alloc_type)->Free(desc->mbuf);
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() { return OpenCLThreadStore::Get(); }
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
char* info = new char[ret_size];
OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, info, nullptr));
std::string ret = info;
delete[] info;
return ret;
}
std::string GetOpenCLVersion(cl_device_id pid) {
// The version string returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO". To
// match other implementations, we want to return only "$MAJOR.$MINOR".
std::string ret = GetDeviceInfo(pid, CL_DEVICE_VERSION);
const size_t version_start = 7; // Length of initial "OpenCL " prefix to skip
const size_t version_end = ret.find(' ', version_start);
return ret.substr(version_start, version_end - version_start);
}
std::vector<cl_platform_id> GetPlatformIDs() {
cl_uint ret_size;
cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
std::vector<cl_platform_id> ret;
if (code != CL_SUCCESS) return ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
return ret;
}
std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
cl_device_type dtype = CL_DEVICE_TYPE_ALL;
if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
cl_uint ret_size;
cl_int code = clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size);
std::vector<cl_device_id> ret;
if (code != CL_SUCCESS) return ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
return ret;
}
bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
if (value.length() == 0) return true;
std::string param_value = GetPlatformInfo(pid, param_name);
return param_value.find(value) != std::string::npos;
}
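// One-time initialization guarded by double-checked locking: enumerate
// platforms, filter devices by type and OpenCL version, then create one
// context per platform and one command queue per device.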
void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
const std::string& platform_name, cl_context_properties ctx_props[]) {
if (initialized_) return;
std::lock_guard<std::mutex> lock(this->mu);
if (initialized_) return;
this->type_key = type_key;
// matched platforms
std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
if (platform_ids.size() == 0) {
LOG(WARNING) << "No OpenCL platform matched given existing options ...";
return;
}
auto find_opencl_device = [&](const std::string& device_type, const std::string& platform_name) {
std::unordered_map<cl_platform_id, std::vector<cl_device_id>> device_map;
for (auto platform_id : platform_ids) {
if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
continue;
}
std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
std::vector<cl_device_id> supported_devices = {};
auto get_version_str = [](int version) {
std::ostringstream out;
out.precision(1);
out << std::fixed << version / 100.f;
return out.str();
};
for (auto& device : devices_matched) {
std::string ver = GetOpenCLVersion(device);
int opencl_version = std::stod(ver) * 100;
if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
supported_devices.push_back(device);
} else {
std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
" has OpenCL version == " + get_version_str(opencl_version);
LOG(WARNING) << "TVM supports devices with OpenCL version >= "
<< get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
<< ". This device will be ignored.";
if (noDevicesErrorMsg.empty()) {
noDevicesErrorMsg =
"Probably this error happen because TVM supports devices with OpenCL version >= " +
get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
}
noDevicesErrorMsg += "\t" + dev_msg + "\n";
}
}
if (supported_devices.size()) {
device_map[platform_id] = supported_devices;
}
}
return device_map;
};
auto device_map = find_opencl_device(device_type, platform_name);
if ((device_map.size() == 0) && (device_type == "gpu")) {
LOG(WARNING) << "Using CPU OpenCL device";
device_map = find_opencl_device("cpu", "");
}
if (device_map.empty()) {
LOG(WARNING) << "No OpenCL device";
initialized_ = true;
return;
}
ICHECK_EQ(this->queues.size(), 0U);
cl_int err_code;
for (auto& [platform, devices] : device_map) {
this->platform_ids.push_back(platform);
this->contexts[platform] =
clCreateContext(ctx_props, devices.size(), &(devices[0]), nullptr, nullptr, &err_code);
this->devices.insert(this->devices.end(), devices.begin(), devices.end());
for (size_t i = 0; i < devices.size(); ++i) {
cl_device_id did = devices[i];
CLDeviceInfo dev_info;
dev_info.platform_id = platform;
this->queues.push_back(clCreateCommandQueue(this->contexts[platform], did, 0, &err_code));
OPENCL_CHECK_ERROR(err_code);
cl_uint row_pitch;
OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, sizeof(row_pitch),
&row_pitch, nullptr));
if (0 == row_pitch) {
row_pitch = kAllocAlignment; // Fallback
}
dev_info.image_row_align = row_pitch;
dev_info.image_from_buffer_support =
IsOpenCLExtensionSupported(did, "cl_khr_image2d_from_buffer");
device_info.insert({did, dev_info});
}
OPENCL_CHECK_ERROR(err_code);
}
this->events.resize(this->devices.size());
initialized_ = true;
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def_packed("device_api.opencl.alloc_nd",
[](ffi::PackedArgs args, ffi::Any* rv) {
int32_t device_type = args[0].cast<int32_t>();
int32_t device_id = args[1].cast<int32_t>();
int32_t dtype_code_hint = args[2].cast<int32_t>();
int32_t dtype_bits_hint = args[3].cast<int32_t>();
auto scope = args[4].cast<std::string>();
CHECK(scope.find("texture") != std::string::npos);
int64_t ndim = args[5].cast<int64_t>();
CHECK_EQ(ndim, 2);
int64_t* shape = static_cast<int64_t*>(args[6].cast<void*>());
int64_t width = shape[0];
int64_t height = shape[1];
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
DLDataType type_hint;
type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
type_hint.lanes = 1;
*rv = OpenCLWorkspace::Global()->AllocDataSpace(
dev, static_cast<size_t>(width), static_cast<size_t>(height), type_hint,
ffi::String("global.texture"));
})
.def_packed("device_api.opencl.free_nd",
[](ffi::PackedArgs args, ffi::Any* rv) {
int32_t device_type = args[0].cast<int32_t>();
int32_t device_id = args[1].cast<int32_t>();
auto scope = args[2].cast<std::string>();
CHECK(scope.find("texture") != std::string::npos);
void* data = args[3].cast<void*>();
OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
ptr->FreeDataSpace(dev, data);
*rv = static_cast<int32_t>(0);
})
.def_packed("device_api.opencl", [](ffi::PackedArgs args, ffi::Any* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
});
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def("profiling.timer.opencl",
[](Device dev) { return Timer(ffi::make_object<OpenCLTimerNode>(dev)); });
}
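// Pooled allocator specialization that accepts OpenCL texture memory scopes
// and routes view creation and destruction through the OpenCL workspace.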
class OpenCLPooledAllocator final : public memory::PooledAllocator {
public:
explicit OpenCLPooledAllocator() : PooledAllocator() {}
bool AllowMemoryScope(const std::string& mem_scope) const final {
return ((mem_scope.find("texture") != std::string::npos) || mem_scope.empty() ||
("global" == mem_scope));
}
Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) override {
std::lock_guard<std::recursive_mutex> lock(mu_);
size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_;
auto&& it = memory_pool_.find(size);
if (it != memory_pool_.end() && !it->second.empty()) {
auto&& pool = it->second;
auto ret = pool.back();
pool.pop_back();
return ret;
}
Buffer buf;
buf.device = dev;
buf.size = size;
buf.alloc_type = AllocatorType::kPooled;
try {
buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint);
} catch (InternalError& err) {
LOG(WARNING) << "PooledAllocator got InternalError during allocation: " << err.what();
LOG(WARNING) << "Trying to release all unused memory and reallocate...";
ReleaseAll();
buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint);
}
used_memory_.fetch_add(size, std::memory_order_relaxed);
VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B";
return buf;
}
Buffer Alloc(Device dev, ffi::Shape shape, DLDataType type_hint,
const std::string& mem_scope) override {
if (AllowMemoryScope(mem_scope)) {
size_t size = ffi::GetDataSize(shape.Product(), type_hint);
Buffer buf;
buf.device = dev;
buf.size = size;
buf.alloc_type = AllocatorType::kPooled;
buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint,
ffi::String(mem_scope));
if (mem_scope.find("texture") == std::string::npos) {
// Textures are backed by buffers that are accounted for separately - don't
// count them in total memory.
used_memory_.fetch_add(size, std::memory_order_relaxed);
}
DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B";
return buf;
}
LOG(FATAL) << "Unsupported memory scope for this Allocator:" << mem_scope;
return {};
}
void Free(const Buffer& buffer) override {
std::lock_guard<std::recursive_mutex> lock(mu_);
if (memory_pool_.find(buffer.size) == memory_pool_.end()) {
memory_pool_.emplace(buffer.size, std::vector<Buffer>{});
}
memory_pool_.at(buffer.size).push_back(buffer);
VLOG(1) << "reclaim buffer " << buffer.size;
}
void* CreateView(const Buffer& buffer, ffi::Shape shape, DLDataType type_hint,
const std::string& mem_scope) final {
OpenCLWorkspace* ws_ = OpenCLWorkspace::Global();
return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint,
ffi::String(mem_scope));
}
void FreeView(Device dev, void* data) final {
OpenCLWorkspace* ws_ = OpenCLWorkspace::Global();
return ws_->FreeDataSpaceView(dev, data);
}
};
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed("DeviceAllocator.opencl", [](ffi::PackedArgs args, ffi::Any* rv) {
Allocator* alloc = new OpenCLPooledAllocator();
*rv = static_cast<void*>(alloc);
});
}
} // namespace cl
size_t OpenCLTimerNode::count_timer_execs = 0;
std::vector<size_t> OpenCLTimerNode::event_start_idxs;
} // namespace runtime
} // namespace tvm