/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file opencl_device_api.cc
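* \brief OpenCL device API implementation.
*
* Illustrative usage sketch (not part of this file; it assumes only the
* generic DeviceAPI entry points that this file implements):
* \code
*   Device dev{kDLOpenCL, 0};
*   DeviceAPI* api = DeviceAPI::Get(dev);
*   DLDataType f32{kDLFloat, 32, 1};
*   void* data = api->AllocDataSpace(dev, 1024, kAllocAlignment, f32);
*   api->FreeDataSpace(dev, data);
* \endcode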
*/
#include <dmlc/parameter.h>
#include <dmlc/thread_local.h>
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/profiling.h>
#include <sstream>
#include "../memory/pooled_allocator.h"
#include "opencl_common.h"
#ifdef OPENCL_ENABLE_HOST_PTR
#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR
#else
#define CL_MEM_CREATE_FLAGS CL_MEM_READ_WRITE
#endif
namespace tvm {
namespace runtime {
namespace cl {
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
std::string GetOpenCLVersion(cl_device_id pid);
struct ImageInfo {
size_t origin[3] = {};
size_t region[3] = {};
size_t row_pitch = 0;
size_t slice_pitch = 0;
};
/*!
* \brief Utility to apply a memory layout specific lowering convention
* to infer the physical shape from the provided DLTensor's logical shape.
* \param desc Descriptor which contains the buffer and layout tag.
* \param tensor The DLTensor used to infer the tensor's physical shape.
* \return The image origin, region, and pitch describing the physical layout.
*/
ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
ImageInfo info{};
ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;
info.origin[0] = info.origin[1] = info.origin[2] = 0;
info.row_pitch = 0;
info.slice_pitch = 0;
size_t axis = DefaultTextureLayoutSeparator(
tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, axis);
info.region[0] = texture_shape.width;
info.region[1] = texture_shape.height;
info.region[2] = 1;
return info;
}
cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
ffi::Optional<ffi::String> mem_scope) {
if (!mem_scope.has_value()) {
return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
} else if (mem_scope.value() == "global.texture") {
return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
} else if (mem_scope.value() == "global.texture-weight") {
return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
} else if (mem_scope.value() == "global.texture-nhwc") {
return cl::BufferDescriptor::MemoryLayout::kImage2DNHWC;
}
LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
}
ffi::String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
switch (layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
return "global";
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
return "global.texture";
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
return "global.texture-weight";
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
return "global.texture-nhwc";
}
LOG(FATAL) << "No scope corresponding to the provided memory layout: "
<< static_cast<int>(layout);
return "";
}
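// Builds a temporary DLTensor with the given shape and dtype so the device API
// can report the allocation size in bytes.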
static size_t GetMemObjectSize(Device dev, int ndim, const int64_t* shape, DLDataType dtype) {
DLTensor temp;
temp.data = nullptr;
temp.device = dev;
temp.ndim = ndim;
temp.dtype = dtype;
temp.shape = const_cast<int64_t*>(shape);
temp.strides = nullptr;
temp.byte_offset = 0;
size_t size = DeviceAPI::Get(dev)->GetDataSize(temp);
return size;
}
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return OpenCLThreadEntry::ThreadLocal(); }
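// Process-wide workspace singleton; heap-allocated and never freed so it stays
// valid through static destruction.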
OpenCLWorkspace* OpenCLWorkspace::Global() {
static OpenCLWorkspace* inst = new OpenCLWorkspace();
return inst;
}
cl_device_id OpenCLWorkspace::GetCLDeviceID(int device_id) {
this->Init();
ICHECK_LT(device_id, devices.size()) << "Invalid device id " << device_id << ". " << GetError();
return devices[device_id];
}
void OpenCLWorkspace::SetDevice(Device dev) { GetThreadEntry()->device.device_id = dev.device_id; }
void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) {
this->Init();
size_t index = static_cast<size_t>(dev.device_id);
if (kind == kExist) {
*rv = static_cast<int>(index < devices.size());
return;
}
cl_device_id device_id = GetCLDeviceID(index);
switch (kind) {
case kExist:
break;
case kMaxThreadsPerBlock: {
size_t value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &value,
nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kWarpSize: {
/* TODO: The warp size of an OpenCL device is not always 1;
e.g., Intel Graphics has a sub-group concept which contains 8 - 32 work items,
corresponding to the number of SIMD entries the hardware configures.
We need to figure out a way to query this information from the hardware.
*/
const int warp_size = dmlc::GetEnv("TVM_OPENCL_WARP_SIZE", 1);
*rv = warp_size;
break;
}
case kMaxSharedMemoryPerBlock: {
cl_ulong value;
OPENCL_CALL(
clGetDeviceInfo(device_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &value, nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kComputeVersion:
*rv = GetOpenCLVersion(device_id);
break;
case kDeviceName:
*rv = GetDeviceInfo(device_id, CL_DEVICE_NAME);
break;
case kMaxClockRate: {
cl_uint value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &value,
nullptr));
// OpenCL reports the clock rate in MHz, while CUDA/ROCm report it in kHz;
// convert to kHz for consistency.
*rv = static_cast<int32_t>(value * 1000);
break;
}
case kMultiProcessorCount: {
cl_uint value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &value,
nullptr));
*rv = static_cast<int32_t>(value);
break;
}
case kMaxThreadDimensions: {
size_t dims[3];
OPENCL_CALL(
clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(dims), dims, nullptr));
std::stringstream ss; // Use a JSON string to return multiple int values.
ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
*rv = ss.str();
break;
}
case kMaxRegistersPerBlock:
return;
case kGcnArch:
return;
case kApiVersion: {
*rv = CL_TARGET_OPENCL_VERSION;
break;
}
case kDriverVersion: {
char value[128] = {0};
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(value) - 1, value, nullptr));
*rv = std::string(value);
break;
}
case kL2CacheSizeBytes: {
// NOTE(Zihao): this query may not reflect the real L2 cache size on CUDA/AMD GPUs.
cl_ulong value;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(value), &value,
nullptr));
*rv = static_cast<int64_t>(value);
break;
}
case kTotalGlobalMemory: {
cl_ulong total_global_memory;
OPENCL_CALL(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(total_global_memory),
&total_global_memory, nullptr));
*rv = static_cast<int64_t>(total_global_memory);
return;
}
case kAvailableGlobalMemory:
// Not currently implemented. Based on
// https://stackoverflow.com/a/3568223, may not be implementable
// at all through OpenCL API.
break;
case kImagePitchAlignment: {
*rv = static_cast<int64_t>(device_info[device_id].image_row_align);
break;
}
}
}
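// When built with OPENCL_ENABLE_HOST_PTR, maps the buffer into host-visible
// memory so the CPU can write it directly; otherwise this is a pass-through.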
void* OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device dev, size_t size) {
#if defined(OPENCL_ENABLE_HOST_PTR)
this->Init();
cl_int err_code;
desc->host_ptr = reinterpret_cast<cl_uchar*>(
clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, CL_MAP_WRITE, 0,
sizeof(cl_uchar) * size, 0, nullptr, nullptr, &err_code));
OPENCL_CHECK_ERROR(err_code);
#endif // OPENCL_ENABLE_HOST_PTR
return desc;
}
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
DLDataType type_hint) {
this->Init();
return AllocCLBuffer(dev, size, alignment, type_hint);
}
void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint,
ffi::Optional<ffi::String> mem_scope) {
// Texture allocation given width and height
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
size_t mem_size = row_pitch * height;
// Alloc back buffer from pool
cl::BufferDescriptor* back_buffer = nullptr;
if (IsBufferToImageSupported(dev.device_id)) {
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, mem_size, kTempAllocaAlignment, type_hint);
back_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
back_buffer->mbuf = buf;
}
if (!mem_scope.has_value()) {
mem_scope = ffi::String("global.texture");
}
return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope);
}
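// N-d allocation: the default "global" scope goes through the pooled buffer
// allocator, while texture scopes are flattened to a 2D image shape first.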
void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
ffi::Optional<ffi::String> mem_scope) {
this->Init();
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
size_t size = GetMemObjectSize(dev, ndim, shape, dtype);
cl::BufferDescriptor* ret_buffer = nullptr;
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, size, kTempAllocaAlignment, dtype);
ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
ret_buffer->mbuf = buf;
return ret_buffer;
}
size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope);
}
void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment,
DLDataType type_hint) {
this->Init();
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
cl_int err_code;
cl::BufferDescriptor* desc = new cl::BufferDescriptor;
// clCreateBuffer fails with CL_INVALID_BUFFER_SIZE when size is 0, so allocate
// at least one byte.
if (size == 0) {
size = 1;
}
desc->buffer =
clCreateBuffer(this->contexts[platform], CL_MEM_CREATE_FLAGS, size, nullptr, &err_code);
desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
return CreateHostPtrIfEnabled(desc, dev, size);
}
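// Creates a 2D RGBA image. When cl_khr_image2d_from_buffer is supported, the
// image aliases the provided back buffer instead of owning separate storage.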
void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height,
size_t row_pitch, DLDataType type_hint,
ffi::Optional<ffi::String> mem_scope) {
this->Init();
ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos)
<< "Expect texture scope while creating an Image object";
cl::BufferDescriptor* back_desc = static_cast<cl::BufferDescriptor*>(back_buffer);
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
cl_int err_code;
cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
cl_image_format format = {CL_RGBA, cl_type};
cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};
if (IsBufferToImageSupported(dev.device_id)) {
descriptor.image_row_pitch = row_pitch;
descriptor.buffer = back_desc->buffer;
}
cl_mem mptr = clCreateImage(this->contexts[platform], CL_MEM_CREATE_FLAGS, &format, &descriptor,
nullptr, &err_code);
OPENCL_CHECK_ERROR(err_code);
cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
desc->buffer = mptr;
desc->back_buffer = back_desc;
return desc;
}
size_t OpenCLWorkspace::GetDataSize(const DLTensor& arr, ffi::Optional<ffi::String> mem_scope) {
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
return DeviceAPI::GetDataSize(arr);
}
cl_uint row_align = GetImageAlignment(GetThreadEntry()->device.device_id);
std::vector<int64_t> shape;
shape.assign(arr.shape, arr.shape + arr.ndim);
return runtime::GetTextureMemorySize<std::vector<int64_t>>(shape, arr.dtype.bits, arr.dtype.lanes,
mem_scope.value(), row_align);
}
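// Reinterprets existing memory under a new scope (buffer <-> image). Devices
// lacking cl_khr_image2d_from_buffer cannot alias, so they get a separate
// allocation marked is_compat_view.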
void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape shape,
DLDataType dtype, ffi::Optional<ffi::String> mem_scope) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data);
// Fallback for devices without "cl_khr_image2d_from_buffer"
if (!IsBufferToImageSupported(dev.device_id)) {
cl::BufferDescriptor* ret_desc = desc; // buffer -> buffer
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// image -> buffer
size_t nbytes = GetMemObjectSize(dev, shape.size(), shape.data(), dtype);
ret_desc = static_cast<cl::BufferDescriptor*>(
OpenCLWorkspace::AllocCLBuffer(dev, nbytes, kTempAllocaAlignment, dtype));
ret_desc->is_compat_view = true;
}
} else {
// Any -> Image
size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis);
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
ret_desc = static_cast<cl::BufferDescriptor*>(OpenCLWorkspace::Global()->AllocCLImage(
dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope));
ret_desc->is_compat_view = true;
}
return ret_desc;
}
if (!mem_scope.has_value() || (*mem_scope).empty() || (*mem_scope) == "global") {
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// buffer -> buffer
return desc;
} else {
// image -> buffer
return desc->back_buffer;
}
}
size_t axis = DefaultTextureLayoutSeparator(shape.size(), mem_scope.value());
auto texture = ApplyTexture2DFlattening<int64_t>(shape.data(), shape.size(), axis);
cl_uint row_align = GetImageAlignment(dev.device_id);
size_t pixel_size = (dtype.bits * dtype.lanes + 7) / 8;
size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // 4 channels per CL_RGBA texel
cl::BufferDescriptor* back_buffer;
if (desc->back_buffer) {
// image -> image
back_buffer = desc->back_buffer;
} else {
// buffer -> image
back_buffer = desc;
}
return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height,
row_pitch, dtype, mem_scope);
}
void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) {
auto* desc = static_cast<const cl::BufferDescriptor*>(ptr);
// Handle the fall back
if (!IsBufferToImageSupported(dev.device_id)) {
if (desc->is_compat_view) {
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
}
return;
}
if (desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
}
}
void* OpenCLWorkspace::GetNativePtr(const tvm::runtime::Tensor& narr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data);
return desc->host_ptr;
}
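// Rebinds a 1D buffer to caller-provided host memory via the QCOM external
// host pointer extension; only takes effect when built with USE_OPENCL_EXTN_QCOM.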
void OpenCLWorkspace::SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr,
size_t buf_size) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(narr.operator->()->data);
this->Init();
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
#ifdef USE_OPENCL_EXTN_QCOM
Device dev = narr.operator->()->device;
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
if (desc->host_ptr) {
OPENCL_CALL(clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr,
nullptr));
desc->host_ptr = nullptr;
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
cl_int err_code;
desc->buffer =
clCreateBuffer(this->contexts[platform],
CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM, buf_size,
host_ptr, &err_code);
desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
#endif
} else {
LOG(FATAL) << "Native Ptr not enabled over image objects";
}
}
void OpenCLWorkspace::SetPerfHint(Device dev, cl_uint perf_hint) {
#ifdef CL_CONTEXT_PERF_HINT_QCOM
cl_device_id device_id = GetCLDeviceID(dev.device_id);
auto platform = device_info[device_id].platform_id;
OPENCL_CALL(clSetPerfHintQCOM(this->contexts[platform], perf_hint));
#endif
}
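// Releases memory from any allocation path: images with pooled back buffers,
// pooled 1D buffers (unmapping any host pointer first), or standalone images
// on devices without buffer-to-image support.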
void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
if (desc->back_buffer) {
// 2D Image w/ back buffer allocated from pool
OPENCL_CALL(clReleaseMemObject(desc->buffer));
MemoryManager::GetAllocator(dev, desc->back_buffer->mbuf.alloc_type)
->Free(desc->back_buffer->mbuf);
delete desc;
} else {
if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
// 1D buffer allocated from pool
if (desc->host_ptr) {
clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
reinterpret_cast<void*>(desc->host_ptr), 0, nullptr, nullptr);
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
} else if (!IsBufferToImageSupported(dev.device_id)) {
// 2D Image allocated w/o pool
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
return;
}
}
}
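// Dispatches on the source/destination memory layouts: buffer<->buffer,
// buffer<->image, and image<->image copies on device, plus buffer/image reads
// and writes against host memory.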
void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
this->Init();
size_t nbytes = GetDataSize(*from);
ICHECK_EQ(nbytes, GetDataSize(*to));
ICHECK(IsContiguous(*from) && IsContiguous(*to))
<< "CopyDataFromTo only support contiguous array for now";
if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
0, nullptr, nullptr));
} else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
auto image_info = GetImageInfo(to_desc, to);
OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, from->byte_offset, image_info.origin,
image_info.region, 0, nullptr, nullptr));
} else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
auto image_info = GetImageInfo(from_desc, from);
OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
to_desc->buffer, image_info.origin, image_info.region,
to->byte_offset, 0, nullptr, nullptr));
} else {
auto to_image_info = GetImageInfo(to_desc, to);
auto from_image_info = GetImageInfo(from_desc, from);
OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
from_image_info.origin, to_image_info.origin,
to_image_info.region, 0, nullptr, nullptr));
}
} else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
switch (from_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueReadBuffer(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
auto image_info = GetImageInfo(from_desc, from);
// TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
// Note that when utilizing texture pools for memory reuse, the allocated image
// size can be larger than the size to be read.
OPENCL_CALL(clEnqueueReadImage(
this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(from->device)));
} else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
switch (to_desc->layout) {
case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
OPENCL_CALL(clEnqueueWriteBuffer(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
auto image_info = GetImageInfo(to_desc, to);
OPENCL_CALL(clEnqueueWriteImage(
this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
image_info.region, image_info.row_pitch, image_info.slice_pitch,
static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
break;
}
OPENCL_CALL(clFinish(this->GetQueue(to->device)));
} else {
LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
}
}
void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
this->Init();
ICHECK(stream == nullptr);
OPENCL_CALL(clFinish(this->GetQueue(dev)));
}
void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
this->Init();
cl::BufferDescriptor* ret_buffer = nullptr;
auto buf = MemoryManager::GetOrCreateAllocator(dev, AllocatorType::kPooled)
->Alloc(dev, size, kTempAllocaAlignment, type_hint);
ret_buffer = static_cast<cl::BufferDescriptor*>(buf.data);
ret_buffer->mbuf = buf;
return ret_buffer;
}
void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(data);
MemoryManager::GetAllocator(dev, desc->mbuf.alloc_type)->Free(desc->mbuf);
}
typedef dmlc::ThreadLocalStore<OpenCLThreadEntry> OpenCLThreadStore;
OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() { return OpenCLThreadStore::Get(); }
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
std::string ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
return ret;
}
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
size_t ret_size;
OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
char* info = new char[ret_size];
OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, info, nullptr));
std::string ret = info;
delete[] info;
return ret;
}
std::string GetOpenCLVersion(cl_device_id pid) {
// The version string returned is "OpenCL $MAJOR.$MINOR $VENDOR_INFO". To
// match other implementations, we want to return only "$MAJOR.$MINOR".
std::string ret = GetDeviceInfo(pid, CL_DEVICE_VERSION);
const size_t version_start = 7; // Length of initial "OpenCL " prefix to skip
const size_t version_end = ret.find(' ', version_start);
return ret.substr(version_start, version_end - version_start);
}
std::vector<cl_platform_id> GetPlatformIDs() {
cl_uint ret_size;
cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
std::vector<cl_platform_id> ret;
if (code != CL_SUCCESS) return ret;
ret.resize(ret_size);
OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
return ret;
}
std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
cl_device_type dtype = CL_DEVICE_TYPE_ALL;
if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
cl_uint ret_size;
cl_int code = clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size);
std::vector<cl_device_id> ret;
if (code != CL_SUCCESS) return ret;
ret.resize(ret_size);
OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
return ret;
}
bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
if (value.length() == 0) return true;
std::string param_value = GetPlatformInfo(pid, param_name);
return param_value.find(value) != std::string::npos;
}
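// One-time initialization guarded by double-checked locking: enumerate
// platforms, filter devices by type and OpenCL version, then create one
// context per platform and one command queue per device.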
void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
const std::string& platform_name, cl_context_properties ctx_props[]) {
if (initialized_) return;
std::lock_guard<std::mutex> lock(this->mu);
if (initialized_) return;
this->type_key = type_key;
// matched platforms
std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
if (platform_ids.size() == 0) {
LOG(WARNING) << "No OpenCL platform matched given existing options ...";
return;
}
auto find_opencl_device = [&](const std::string& device_type, const std::string& platform_name) {
std::unordered_map<cl_platform_id, std::vector<cl_device_id>> device_map;
for (auto platform_id : platform_ids) {
if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
continue;
}
std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
std::vector<cl_device_id> supported_devices = {};
auto get_version_str = [](int version) {
std::ostringstream out;
out.precision(1);
out << std::fixed << version / 100.f;
return out.str();
};
for (auto& device : devices_matched) {
std::string ver = GetOpenCLVersion(device);
int opencl_version = std::stod(ver) * 100;
if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
supported_devices.push_back(device);
} else {
std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
" has OpenCL version == " + get_version_str(opencl_version);
LOG(WARNING) << "TVM supports devices with OpenCL version >= "
<< get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
<< ". This device will be ignored.";
if (noDevicesErrorMsg.empty()) {
noDevicesErrorMsg =
"Probably this error happen because TVM supports devices with OpenCL version >= " +
get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
}
noDevicesErrorMsg += "\t" + dev_msg + "\n";
}
}
if (supported_devices.size()) {
device_map[platform_id] = supported_devices;
}
}
return device_map;
};
auto device_map = find_opencl_device(device_type, platform_name);
if ((device_map.size() == 0) && (device_type == "gpu")) {
LOG(WARNING) << "Using CPU OpenCL device";
device_map = find_opencl_device("cpu", "");
}
if (device_map.empty()) {
LOG(WARNING) << "No OpenCL device";
initialized_ = true;
return;
}
ICHECK_EQ(this->queues.size(), 0U);
cl_int err_code;
for (auto& [platform, devices] : device_map) {
this->platform_ids.push_back(platform);
this->contexts[platform] =
clCreateContext(ctx_props, devices.size(), &(devices[0]), nullptr, nullptr, &err_code);
this->devices.insert(this->devices.end(), devices.begin(), devices.end());
for (size_t i = 0; i < devices.size(); ++i) {
cl_device_id did = devices[i];
CLDeviceInfo dev_info;
dev_info.platform_id = platform;
this->queues.push_back(clCreateCommandQueue(this->contexts[platform], did, 0, &err_code));
OPENCL_CHECK_ERROR(err_code);
cl_uint row_pitch;
OPENCL_CALL(clGetDeviceInfo(did, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR, sizeof(row_pitch),
&row_pitch, nullptr));
if (0 == row_pitch) {
row_pitch = kAllocAlignment; // Fallback
}
dev_info.image_row_align = row_pitch;
dev_info.image_from_buffer_support =
IsOpenCLExtensionSupported(did, "cl_khr_image2d_from_buffer");
device_info.insert({did, dev_info});
}
OPENCL_CHECK_ERROR(err_code);
}
this->events.resize(this->devices.size());
initialized_ = true;
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def_packed("device_api.opencl.alloc_nd",
[](ffi::PackedArgs args, ffi::Any* rv) {
int32_t device_type = args[0].cast<int32_t>();
int32_t device_id = args[1].cast<int32_t>();
int32_t dtype_code_hint = args[2].cast<int32_t>();
int32_t dtype_bits_hint = args[3].cast<int32_t>();
auto scope = args[4].cast<std::string>();
CHECK(scope.find("texture") != std::string::npos);
int64_t ndim = args[5].cast<int64_t>();
CHECK_EQ(ndim, 2);
int64_t* shape = static_cast<int64_t*>(args[6].cast<void*>());
int64_t width = shape[0];
int64_t height = shape[1];
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
DLDataType type_hint;
type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
type_hint.lanes = 1;
*rv = OpenCLWorkspace::Global()->AllocDataSpace(
dev, static_cast<size_t>(width), static_cast<size_t>(height), type_hint,
ffi::String("global.texture"));
})
.def_packed("device_api.opencl.free_nd",
[](ffi::PackedArgs args, ffi::Any* rv) {
int32_t device_type = args[0].cast<int32_t>();
int32_t device_id = args[1].cast<int32_t>();
auto scope = args[2].cast<std::string>();
CHECK(scope.find("texture") != std::string::npos);
void* data = args[3].cast<void*>();
OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
Device dev;
dev.device_type = static_cast<DLDeviceType>(device_type);
dev.device_id = device_id;
ptr->FreeDataSpace(dev, data);
*rv = static_cast<int32_t>(0);
})
.def_packed("device_api.opencl", [](ffi::PackedArgs args, ffi::Any* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
});
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def("profiling.timer.opencl",
[](Device dev) { return Timer(ffi::make_object<OpenCLTimerNode>(dev)); });
}
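// Pooled allocator specialization that accepts OpenCL texture memory scopes
// and routes view creation and destruction through the OpenCL workspace.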
class OpenCLPooledAllocator final : public memory::PooledAllocator {
public:
explicit OpenCLPooledAllocator() : PooledAllocator() {}
bool AllowMemoryScope(const std::string& mem_scope) const final {
return ((mem_scope.find("texture") != std::string::npos) || mem_scope.empty() ||
("global" == mem_scope));
}
Buffer Alloc(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) override {
std::lock_guard<std::recursive_mutex> lock(mu_);
size_t size = ((nbytes + page_size_ - 1) / page_size_) * page_size_;
auto&& it = memory_pool_.find(size);
if (it != memory_pool_.end() && !it->second.empty()) {
auto&& pool = it->second;
auto ret = pool.back();
pool.pop_back();
return ret;
}
Buffer buf;
buf.device = dev;
buf.size = size;
buf.alloc_type = AllocatorType::kPooled;
try {
buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint);
} catch (InternalError& err) {
LOG(WARNING) << "PooledAllocator got InternalError during allocation: " << err.what();
LOG(WARNING) << "Trying to release all unused memory and reallocate...";
ReleaseAll();
buf.data = DeviceAllocDataSpace(dev, size, alignment, type_hint);
}
used_memory_.fetch_add(size, std::memory_order_relaxed);
VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B";
return buf;
}
Buffer Alloc(Device dev, ffi::Shape shape, DLDataType type_hint,
const std::string& mem_scope) override {
if (AllowMemoryScope(mem_scope)) {
size_t size = ffi::GetDataSize(shape.Product(), type_hint);
Buffer buf;
buf.device = dev;
buf.size = size;
buf.alloc_type = AllocatorType::kPooled;
buf.data = DeviceAPI::Get(dev)->AllocDataSpace(dev, shape.size(), shape.data(), type_hint,
ffi::String(mem_scope));
if (mem_scope.find("texture") == std::string::npos) {
// Textures are backed by buffers that are accounted for separately - don't
// count them in total memory.
used_memory_.fetch_add(size, std::memory_order_relaxed);
}
DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B";
return buf;
}
LOG(FATAL) << "Unsupported memory scope for this Allocator:" << mem_scope;
return {};
}
void Free(const Buffer& buffer) override {
std::lock_guard<std::recursive_mutex> lock(mu_);
if (memory_pool_.find(buffer.size) == memory_pool_.end()) {
memory_pool_.emplace(buffer.size, std::vector<Buffer>{});
}
memory_pool_.at(buffer.size).push_back(buffer);
VLOG(1) << "reclaim buffer " << buffer.size;
}
void* CreateView(const Buffer& buffer, ffi::Shape shape, DLDataType type_hint,
const std::string& mem_scope) final {
OpenCLWorkspace* ws_ = OpenCLWorkspace::Global();
return ws_->AllocDataSpaceView(buffer.device, buffer.data, shape, type_hint,
ffi::String(mem_scope));
}
void FreeView(Device dev, void* data) final {
OpenCLWorkspace* ws_ = OpenCLWorkspace::Global();
return ws_->FreeDataSpaceView(dev, data);
}
};
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef().def_packed("DeviceAllocator.opencl", [](ffi::PackedArgs args, ffi::Any* rv) {
Allocator* alloc = new OpenCLPooledAllocator();
*rv = static_cast<void*>(alloc);
});
}
} // namespace cl
size_t OpenCLTimerNode::count_timer_execs = 0;
std::vector<size_t> OpenCLTimerNode::event_start_idxs;
} // namespace runtime
} // namespace tvm