src/runtime/opencl/opencl_device_api.cc - tvm - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file opencl_device_api.cc
  */
 #include <dmlc/parameter.h>
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/profiling.h>
 #include <tvm/runtime/registry.h>

 #include <sstream>

 #include "opencl_common.h"

 namespace tvm {
 namespace runtime {
 namespace cl {

 // Forward declarations of the string-returning query helpers defined later in this file.
 /*! \brief Query the platform attribute `param_name` of platform `pid` as a string. */
 std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
 /*! \brief Query the device attribute `param_name` of device `pid` as a string. */
 std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
 /*! \brief Return the "$MAJOR.$MINOR" part of the device's CL_DEVICE_VERSION string. */
 std::string GetOpenCLVersion(cl_device_id pid);

 /*!
  * \brief Origin, extent and pitches handed to the OpenCL image transfer calls.
  *
  * Every member starts out as zero.
  */
 struct ImageInfo {
   /*! \brief Offset of the first element, one entry per image axis. */
   size_t origin[3] = {0, 0, 0};
   /*! \brief Extent of the transferred area, one entry per image axis. */
   size_t region[3] = {0, 0, 0};
   /*! \brief Row pitch passed to the read/write image calls. */
   size_t row_pitch{0};
   /*! \brief Slice pitch passed to the read/write image calls. */
   size_t slice_pitch{0};
 };

 /*!
  * \brief Derive the 2D image region that backs a tensor stored with a texture layout.
  *
  * The tensor's logical shape is flattened to a (width, height) pair, using the
  * separator axis chosen for the scope that corresponds to the descriptor's layout.
  *
  * \param desc Descriptor which contains the buffer and layout tag.
  * \param tensor The DLTensor whose logical shape is flattened.
  * \return ImageInfo with zero origin, zero pitches and region {width, height, 1}.
  */
 ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
   ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;

   const auto scope = cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout);
   const size_t separator = DefaultTextureLayoutSeparator(tensor->ndim, scope);
   const auto flattened =
       ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, separator);

   // Value-initialization already zeroes the origin and both pitches.
   ImageInfo result{};
   result.region[0] = flattened.width;
   result.region[1] = flattened.height;
   result.region[2] = 1;
   return result;
 }

 /*!
  * \brief Map a memory scope name onto the matching buffer memory layout.
  *
  * \param mem_scope Optional scope name. An undefined scope and "global" both select
  *        the flat 1D buffer layout.
  * \return The layout tag for the scope.
  * \note A scope with no corresponding layout is reported through LOG(FATAL).
  */
 cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
     Optional<String> mem_scope) {
   using Layout = cl::BufferDescriptor::MemoryLayout;
   // "global" is the scope ScopeFromMemoryLayout reports for kBuffer1D, so accept it
   // here as well to keep the two mappings inverse to each other.
   if (!mem_scope.defined() || mem_scope.value() == "global") {
     return Layout::kBuffer1D;
   } else if (mem_scope.value() == "global.texture") {
     return Layout::kImage2DActivation;
   } else if (mem_scope.value() == "global.texture-weight") {
     return Layout::kImage2DWeight;
   } else if (mem_scope.value() == "global.texture-nhwc") {
     return Layout::kImage2DNHWC;
   }
   LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
   // Not expected to be reached; mirrors the trailing return in ScopeFromMemoryLayout so
   // every path of this non-void function ends in a return statement.
   return Layout::kBuffer1D;
 }

 /*!
  * \brief Translate a buffer memory layout tag into its memory scope name.
  * \param layout The layout tag to translate.
  * \return "global" for the 1D buffer layout, or the matching "global.texture*" name.
  * \note A layout with no scope is reported through LOG(FATAL).
  */
 String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
   using Layout = cl::BufferDescriptor::MemoryLayout;
   if (layout == Layout::kBuffer1D) {
     return "global";
   }
   if (layout == Layout::kImage2DActivation) {
     return "global.texture";
   }
   if (layout == Layout::kImage2DWeight) {
     return "global.texture-weight";
   }
   if (layout == Layout::kImage2DNHWC) {
     return "global.texture-nhwc";
   }
   LOG(FATAL) << "No scope corresponding to the provided memory layout: "
              << static_cast<int>(layout);
   return "";
 }

 /*! \brief Return the entry handed out by OpenCLThreadEntry::ThreadLocal(). */
 OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
   OpenCLThreadEntry* entry = OpenCLThreadEntry::ThreadLocal();
   return entry;
 }

 /*! \brief Return the single OpenCLWorkspace instance shared by every caller. */
 OpenCLWorkspace* OpenCLWorkspace::Global() {
   // Created on the first call; this function never deletes it.
   static OpenCLWorkspace* const workspace = new OpenCLWorkspace();
   return workspace;
 }

 /*!
  * \brief Record the device id of `dev` in the calling thread's entry.
  * \param dev Device whose device_id is stored; its device_type is not used here.
  */
 void OpenCLWorkspace::SetDevice(Device dev) {
   OpenCLThreadEntry* entry = GetThreadEntry();
   entry->device.device_id = dev.device_id;
 }

 /*!
  * \brief Look up one attribute of an OpenCL device.
  *
  * \param dev Device whose device_id indexes the workspace's device list.
  * \param kind Attribute to query.
  * \param rv Receives the attribute value. It is left untouched for
  *        kMaxRegistersPerBlock and kGcnArch.
  */
 void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
   this->Init();
   size_t index = static_cast<size_t>(dev.device_id);
   // Existence is answered before the range check, so an out-of-range id yields 0
   // rather than a failed check.
   if (kind == kExist) {
     *rv = static_cast<int>(index < devices.size());
     return;
   }
   ICHECK_LT(index, devices.size()) << "Invalid device id " << index << ". " << GetError();
   const cl_device_id device = devices[index];
   switch (kind) {
     case kExist:
       // Already answered above.
       break;
     case kMaxThreadsPerBlock: {
       size_t max_work_group_size;
       OPENCL_CALL(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                   sizeof(max_work_group_size), &max_work_group_size, nullptr));
       *rv = static_cast<int64_t>(max_work_group_size);
       break;
     }
     case kWarpSize: {
       // TODO: the warp size of an OpenCL device is not always 1. Intel Graphics, for
       // example, has a sub group concept which contains 8 - 32 work items, matching the
       // number of SIMD entries the hardware configures. We need to figure out a way to
       // query this information from the hardware. Until then the value comes from the
       // TVM_OPENCL_WARP_SIZE environment variable and defaults to 1.
       *rv = dmlc::GetEnv("TVM_OPENCL_WARP_SIZE", 1);
       break;
     }
     case kMaxSharedMemoryPerBlock: {
       cl_ulong local_mem_size;
       OPENCL_CALL(clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size),
                                   &local_mem_size, nullptr));
       *rv = static_cast<int64_t>(local_mem_size);
       break;
     }
     case kComputeVersion: {
       *rv = GetOpenCLVersion(device);
       break;
     }
     case kDeviceName: {
       *rv = GetDeviceInfo(device, CL_DEVICE_NAME);
       break;
     }
     case kMaxClockRate: {
       cl_uint clock_mhz;
       OPENCL_CALL(clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_mhz),
                                   &clock_mhz, nullptr));
       // OpenCL reports MHz while CUDA/ROCm report kHz; scale so all backends agree.
       *rv = static_cast<int32_t>(clock_mhz * 1000);
       break;
     }
     case kMultiProcessorCount: {
       cl_uint compute_units;
       OPENCL_CALL(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units),
                                   &compute_units, nullptr));
       *rv = static_cast<int32_t>(compute_units);
       break;
     }
     case kMaxThreadDimensions: {
       size_t max_sizes[3];
       OPENCL_CALL(clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_sizes),
                                   max_sizes, nullptr));
       // Multiple integers are returned as a JSON-style list inside one string.
       std::ostringstream json;
       json << "[" << max_sizes[0] << ", " << max_sizes[1] << ", " << max_sizes[2] << "]";
       *rv = json.str();
       break;
     }
     case kMaxRegistersPerBlock:
     case kGcnArch:
       // No value is produced for these attributes.
       return;
     case kApiVersion: {
       *rv = CL_TARGET_OPENCL_VERSION;
       break;
     }
     case kDriverVersion: {
       // Zero-filled, and the query is given one byte less than the array holds, so the
       // text stays NUL-terminated.
       char driver_version[128] = {0};
       OPENCL_CALL(clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(driver_version) - 1,
                                   driver_version, nullptr));
       *rv = std::string(driver_version);
       break;
     }
   }
 }

 /*!
  * \brief Allocate a flat OpenCL buffer and wrap it in a BufferDescriptor.
  *
  * \param dev Target device (not consulted here; the workspace context is used).
  * \param size Requested size in bytes; 0 is bumped to 1.
  * \param alignment Unused by this implementation.
  * \param type_hint Unused by this implementation.
  * \return Pointer to a heap-allocated cl::BufferDescriptor with layout kBuffer1D.
  *         FreeDataSpace releases the buffer and deletes the descriptor.
  */
 void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
                                       DLDataType type_hint) {
   this->Init();
   ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
   // CL_INVALID_BUFFER_SIZE if size is 0.
   if (size == 0) {
     size = 1;
   }
   cl_int err_code;
   cl_mem buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
   // Validate the allocation before creating the descriptor, so a failing check cannot
   // leave an orphaned descriptor behind.
   OPENCL_CHECK_ERROR(err_code);
   cl::BufferDescriptor* desc = new cl::BufferDescriptor;
   desc->buffer = buffer;
   desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
   return desc;
 }

 /*!
  * \brief Allocate storage for a tensor, choosing buffer or texture by memory scope.
  *
  * An undefined scope or "global" is forwarded to DeviceAPI::AllocDataSpace. Any other
  * scope must be a texture scope, and the shape is flattened to a 2D image.
  *
  * \param dev Target device.
  * \param ndim Rank of `shape`; must be greater than 2 for texture scopes.
  * \param shape Logical tensor shape.
  * \param dtype Element type, forwarded to the texture allocation.
  * \param mem_scope Optional memory scope name.
  * \return For texture scopes, pointer to a heap-allocated cl::BufferDescriptor.
  */
 void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                       Optional<String> mem_scope) {
   if (!mem_scope.defined() || mem_scope.value() == "global") {
     return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
   }
   ICHECK(IsTextureStorage(std::string(mem_scope.value())))
       << "Device does not support allocate data space with "
       << "specified memory scope: " << mem_scope.value();

   ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
                    << "provided shape is rank " << ndim;

   // Work out the image extent before allocating anything that needs cleanup.
   size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
   auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);

   cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
   try {
     desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
   } catch (...) {
     // If the texture allocation reports its failure as an exception, free the
     // descriptor before propagating so it is not leaked.
     delete desc;
     throw;
   }
   return desc;
 }

 /*!
  * \brief Release the memory object held by a descriptor and delete the descriptor.
  * \param dev Device whose queue is drained before the release.
  * \param ptr Pointer to the cl::BufferDescriptor to free.
  */
 void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
   // Some OpenCL platforms require that the memory object is no longer referenced by
   // the command queue, so wait for the queue to finish first.
   auto queue = this->GetQueue(dev);
   OPENCL_CALL(clFinish(queue));

   auto* descriptor = static_cast<cl::BufferDescriptor*>(ptr);
   OPENCL_CALL(clReleaseMemObject(descriptor->buffer));
   delete descriptor;
 }

 /*!
  * \brief Create a read/write 2D RGBA image of the given extent.
  *
  * \param dev Target device (not consulted here; the workspace context is used).
  * \param width Image width passed to the image descriptor.
  * \param height Image height passed to the image descriptor.
  * \param type_hint Data type translated into the image channel type.
  * \return The created OpenCL memory object.
  */
 cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
                                      DLDataType type_hint) {
   this->Init();
   ICHECK(context != nullptr) << "No OpenCL device. " << GetError();

   const cl_channel_type channel_type = DTypeToOpenCLChannelType(type_hint);
   cl_image_format image_format = {CL_RGBA, channel_type};
   // Only the object type, width and height are set; every remaining field is zero.
   cl_image_desc image_desc = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};

   cl_int err_code;
   cl_mem image = clCreateImage(this->context, CL_MEM_READ_WRITE, &image_format, &image_desc,
                                nullptr, &err_code);
   OPENCL_CHECK_ERROR(err_code);
   return image;
 }

 /*!
  * \brief Allocate a texture from the calling thread's texture pool.
  * \param dev Target device.
  * \param width Requested texture width.
  * \param height Requested texture height.
  * \param type_hint Element type of the texture.
  * \return Whatever the pool's AllocTexture returns for this request.
  */
 void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
                                              DLDataType type_hint) {
   auto& pool = GetThreadEntry()->texture_pool;
   return pool.AllocTexture(dev, width, height, type_hint);
 }

 /*!
  * \brief Hand a texture back to the calling thread's texture pool.
  * \param dev Device the texture belongs to.
  * \param ptr Texture handle previously returned by AllocTextureWorkspace.
  */
 void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
   auto& pool = GetThreadEntry()->texture_pool;
   pool.FreeTexture(dev, ptr);
 }

 /*!
  * \brief Copy the contents of one tensor into another.
  *
  * Three directions are handled: OpenCL to OpenCL, OpenCL to CPU and CPU to OpenCL.
  * Any other combination is fatal. On the OpenCL side the tensor's data pointer is
  * interpreted as a cl::BufferDescriptor, whose layout tag selects between the buffer
  * and the image flavour of the OpenCL copy call.
  *
  * \param from Source tensor; must be contiguous.
  * \param to Destination tensor; must be contiguous and have the same data size.
  * \param stream Not used by this function.
  */
 void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
   size_t nbytes = GetDataSize(*from);
   ICHECK_EQ(nbytes, GetDataSize(*to));
   ICHECK(IsContiguous(*from) && IsContiguous(*to))
       << "CopyDataFromTo only support contiguous array for now";

   if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
     // Device to device: the copies are enqueued on the destination device's queue and
     // this branch does not wait for them to finish.
     const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
     auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
     if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
         from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
       // buffer -> buffer
       OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
                                       to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
                                       0, nullptr, nullptr));
     } else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
                from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
       // buffer -> image; the region comes from the destination tensor's shape
       auto image_info = GetImageInfo(to_desc, to);
       OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
                                              to_desc->buffer, from->byte_offset, image_info.origin,
                                              image_info.region, 0, nullptr, nullptr));
     } else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
                from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
       // image -> buffer; the region comes from the source tensor's shape
       auto image_info = GetImageInfo(from_desc, from);
       OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
                                              to_desc->buffer, image_info.origin, image_info.region,
                                              to->byte_offset, 0, nullptr, nullptr));
     } else {
       // image -> image; the copied region is the destination's
       auto to_image_info = GetImageInfo(to_desc, to);
       auto from_image_info = GetImageInfo(from_desc, from);
       OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
                                      from_image_info.origin, to_image_info.origin,
                                      to_image_info.region, 0, nullptr, nullptr));
     }
   } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
     // Device to host: a non-blocking read (CL_FALSE) followed by clFinish below.
     const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
     switch (from_desc->layout) {
       case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
         OPENCL_CALL(clEnqueueReadBuffer(
             this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
             static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
         break;
       case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
       case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
       case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
         // All image layouts share one read path. row_pitch and slice_pitch are the
         // zeros set by GetImageInfo.
         auto image_info = GetImageInfo(from_desc, from);
         // TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
         // Note that when utilizing texture pools for memory reuse, the allocated image
         // size can be larger than the size to be read.
         OPENCL_CALL(clEnqueueReadImage(
             this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
             image_info.region, image_info.row_pitch, image_info.slice_pitch,
             static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
         break;
     }
     // Wait for the read so the host memory is filled before returning.
     OPENCL_CALL(clFinish(this->GetQueue(from->device)));
   } else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
     // Host to device: a non-blocking write (CL_FALSE) followed by clFinish below.
     auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
     switch (to_desc->layout) {
       case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
         OPENCL_CALL(clEnqueueWriteBuffer(
             this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
             static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
         break;
       case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
       case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
       case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
         // All image layouts share one write path.
         auto image_info = GetImageInfo(to_desc, to);
         OPENCL_CALL(clEnqueueWriteImage(
             this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
             image_info.region, image_info.row_pitch, image_info.slice_pitch,
             static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
         break;
     }
     // Wait for the write so the host memory has been consumed before returning.
     OPENCL_CALL(clFinish(this->GetQueue(to->device)));
   } else {
     LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
   }
 }

 /*!
  * \brief Block until every command queued for `dev` has finished.
  * \param dev Device whose queue is drained.
  * \param stream Must be nullptr.
  */
 void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
   ICHECK(stream == nullptr);
   auto queue = this->GetQueue(dev);
   OPENCL_CALL(clFinish(queue));
 }

 /*!
  * \brief Allocate scratch memory from the calling thread's workspace pool.
  * \param dev Target device.
  * \param size Requested size, forwarded to the pool.
  * \param type_hint Unused by this implementation.
  * \return Whatever the pool's AllocWorkspace returns for this request.
  */
 void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
   auto& workspace_pool = GetThreadEntry()->pool;
   return workspace_pool.AllocWorkspace(dev, size);
 }

 /*!
  * \brief Return scratch memory to the calling thread's workspace pool.
  * \param dev Device the memory belongs to.
  * \param data Pointer previously returned by AllocWorkspace.
  */
 void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
   auto& workspace_pool = GetThreadEntry()->pool;
   workspace_pool.FreeWorkspace(dev, data);
 }

 /*! \brief Store from which OpenCLThreadEntry instances are fetched. */
 using OpenCLThreadStore = dmlc::ThreadLocalStore<OpenCLThreadEntry>;

 /*! \brief Return the OpenCLThreadEntry provided by OpenCLThreadStore::Get(). */
 OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
   OpenCLThreadEntry* entry = OpenCLThreadStore::Get();
   return entry;
 }

 /*!
  * \brief Query a string attribute of an OpenCL platform.
  *
  * \param pid Platform to query.
  * \param param_name Attribute to query.
  * \return The attribute text up to, but excluding, its first NUL character; empty when
  *         the platform reports a size of zero.
  */
 std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
   // First call: ask only for the number of bytes the attribute occupies.
   size_t ret_size = 0;
   OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
   std::string ret(ret_size, '\0');
   if (ret_size == 0) {
     return ret;
   }
   // Second call: fetch the bytes themselves.
   OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
   // Cut at the first NUL so a terminator included in the reported size does not end up
   // as a character of the result. This matches how GetDeviceInfo builds its return value.
   const size_t nul_pos = ret.find('\0');
   if (nul_pos != std::string::npos) {
     ret.resize(nul_pos);
   }
   return ret;
 }

 /*!
  * \brief Query a string attribute of an OpenCL device.
  *
  * \param pid Device to query.
  * \param param_name Attribute to query.
  * \return The attribute text up to its first NUL character; empty when the device
  *         reports a size of zero.
  */
 std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
   // First call: ask only for the number of bytes the attribute occupies.
   size_t ret_size = 0;
   OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
   if (ret_size == 0) {
     // Nothing to read; avoids treating an empty buffer as a C string.
     return std::string();
   }
   // A std::string owns the scratch storage, so it is released even when the second
   // query fails its error check.
   std::string buffer(ret_size, '\0');
   OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &buffer[0], nullptr));
   // Keep only the text before the first NUL, as the C-string conversion did before.
   return std::string(buffer.c_str());
 }

 /*!
  * \brief Extract the "$MAJOR.$MINOR" part of a device's OpenCL version string.
  *
  * The device reports "OpenCL $MAJOR.$MINOR $VENDOR_INFO". To match other
  * implementations only the middle part is returned.
  *
  * \param pid Device to query.
  * \return The text between the 7-character prefix and the next space.
  */
 std::string GetOpenCLVersion(cl_device_id pid) {
   const std::string full_version = GetDeviceInfo(pid, CL_DEVICE_VERSION);

   // Length of the initial "OpenCL " prefix to skip.
   const size_t prefix_length = 7;
   const size_t next_space = full_version.find(' ', prefix_length);
   return full_version.substr(prefix_length, next_space - prefix_length);
 }

 std::vector<cl_platform_id> GetPlatformIDs() {
   cl_uint ret_size;
   cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
   std::vector<cl_platform_id> ret;
   if (code != CL_SUCCESS) return ret;
   ret.resize(ret_size);
   OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
   return ret;
 }

 /*!
  * \brief List the devices of one platform, optionally restricted by device type.
  *
  * \param pid Platform to enumerate.
  * \param device_type "cpu", "gpu" or "accelerator" to restrict the listing; any other
  *        value selects every device type.
  * \return The device ids; empty when the count query fails or reports no device.
  */
 std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
   cl_device_type dtype = CL_DEVICE_TYPE_ALL;
   if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
   if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
   if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
   // First call: only ask how many matching devices exist.
   cl_uint ret_size = 0;
   cl_int code = clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size);
   std::vector<cl_device_id> ret;
   // A zero count is treated like a failed query, so the second call is never issued
   // with an empty output buffer.
   if (code != CL_SUCCESS || ret_size == 0) return ret;
   ret.resize(ret_size);
   OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, ret.data(), nullptr));
   return ret;
 }

 /*!
  * \brief Test whether a platform attribute contains the given text.
  *
  * \param pid Platform to query.
  * \param param_name Attribute to query.
  * \param value Text to search for; an empty value matches every platform.
  * \return true when `value` is empty or occurs inside the attribute string.
  */
 bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
   if (value.empty()) {
     return true;
   }
   const std::string actual = GetPlatformInfo(pid, param_name);
   const bool found = actual.find(value) != std::string::npos;
   return found;
 }

 /*!
  * \brief Select an OpenCL platform and its devices, then create the context and queues.
  *
  * The first platform whose name contains `platform_name` and that offers at least one
  * device with OpenCL version >= CL_TARGET_OPENCL_VERSION is chosen. Devices with an
  * older version are skipped and recorded in noDevicesErrorMsg.
  *
  * \param type_key Stored in this->type_key.
  * \param device_type Device type passed to GetDeviceIDs; when it is "gpu" and no
  *        device is found, CPU devices of the same platform are used instead.
  * \param platform_name Text the platform's CL_PLATFORM_NAME must contain.
  */
 void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
                            const std::string& platform_name) {
   // initialized_ is tested once without the lock and once more after acquiring it.
   if (initialized_) return;
   std::lock_guard<std::mutex> lock(this->mu);
   if (initialized_) return;
   if (context != nullptr) return;
   this->type_key = type_key;
   // matched platforms
   std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
   if (platform_ids.size() == 0) {
     // Note: this path returns without setting initialized_.
     LOG(WARNING) << "No OpenCL platform matched given existing options ...";
     return;
   }
   this->platform_id = nullptr;
   for (auto platform_id : platform_ids) {
     if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
       continue;
     }
     std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
     if ((devices_matched.size() == 0) && (device_type == "gpu")) {
       LOG(WARNING) << "Using CPU OpenCL device";
       devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
     }
     std::vector<cl_device_id> supported_devices = {};
     // Formats an integer such as 120 as "1.2" for the messages below.
     auto get_version_str = [](int version) {
       std::ostringstream out;
       out.precision(1);
       out << std::fixed << version / 100.f;
       return out.str();
     };
     for (auto& device : devices_matched) {
       std::string ver = GetOpenCLVersion(device);
       // "$MAJOR.$MINOR" is scaled by 100 to compare against CL_TARGET_OPENCL_VERSION.
       int opencl_version = std::stod(ver) * 100;
       if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
         supported_devices.push_back(device);
       } else {
         std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
                               " has OpenCL version == " + get_version_str(opencl_version);
         LOG(WARNING) << "TVM supports devices with OpenCL version >= "
                      << get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
                      << ". This device will be ignored.";

         // The header is written once; each rejected device then appends one line.
         if (noDevicesErrorMsg.empty()) {
           noDevicesErrorMsg =
               "Probably this error happen because TVM supports devices with OpenCL version >= " +
               get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
         }
         noDevicesErrorMsg += "\t" + dev_msg + "\n";
       }
     }
     // The first platform that yields a supported device wins.
     if (supported_devices.size() > 0) {
       this->platform_id = platform_id;
       this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
       this->device_type = device_type;
       this->devices = supported_devices;
       break;
     }
   }
   if (this->platform_id == nullptr) {
     // No usable platform: mark as initialized; context is not assigned on this path.
     LOG(WARNING) << "No OpenCL device";
     initialized_ = true;
     return;
   }
   // One context shared by all selected devices, and one command queue per device.
   cl_int err_code;
   this->context = clCreateContext(nullptr, this->devices.size(), &(this->devices[0]), nullptr,
                                   nullptr, &err_code);
   OPENCL_CHECK_ERROR(err_code);
   ICHECK_EQ(this->queues.size(), 0U);
   for (size_t i = 0; i < this->devices.size(); ++i) {
     cl_device_id did = this->devices[i];
     this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code));
     OPENCL_CHECK_ERROR(err_code);
   }
   this->events.resize(this->devices.size());
   initialized_ = true;
 }

 // Packed function "device_api.opencl.alloc_nd": allocate a 2D texture from the calling
 // thread's texture pool.
 // Arguments, in order: device type, device id, dtype code hint, dtype bits hint, scope
 // (must contain "texture"), ndim (must be 2), pointer to the int64 shape whose entries
 // 0 and 1 are used as width and height.
 TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
   int32_t device_type = args[0];
   int32_t device_id = args[1];
   Device dev;
   dev.device_id = device_id;
   dev.device_type = static_cast<DLDeviceType>(device_type);

   int32_t dtype_code_hint = args[2];
   int32_t dtype_bits_hint = args[3];
   DLDataType type_hint;
   type_hint.lanes = 1;
   type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
   type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);

   std::string scope = args[4];
   CHECK(scope.find("texture") != std::string::npos);
   int64_t ndim = args[5];
   CHECK_EQ(ndim, 2);

   int64_t* shape = static_cast<int64_t*>(static_cast<void*>(args[6]));
   const size_t texture_width = static_cast<size_t>(shape[0]);
   const size_t texture_height = static_cast<size_t>(shape[1]);

   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   *rv = workspace->AllocTextureWorkspace(dev, texture_width, texture_height, type_hint);
 });

 // Packed function "device_api.opencl.free_nd": return a texture to the calling thread's
 // texture pool.
 // Arguments, in order: device type, device id, scope (must contain "texture"), texture
 // handle. The result is always the 32-bit integer 0.
 TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
   int32_t device_type = args[0];
   int32_t device_id = args[1];
   std::string scope = args[2];
   CHECK(scope.find("texture") != std::string::npos);
   void* texture = args[3];

   Device dev;
   dev.device_id = device_id;
   dev.device_type = static_cast<DLDeviceType>(device_type);

   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   workspace->FreeTextureWorkspace(dev, texture);
   *rv = static_cast<int32_t>(0);
 });

 // Packed function "device_api.opencl": return the global OpenCL workspace, viewed as a
 // DeviceAPI and passed back as an opaque pointer.
 TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) {
   DeviceAPI* api = OpenCLWorkspace::Global();
   void* handle = static_cast<void*>(api);
   *rv = handle;
 });

 TVM_REGISTER_OBJECT_TYPE(OpenCLTimerNode);

 // Packed function "profiling.timer.opencl": build a Timer around a new OpenCLTimerNode
 // created for the given device.
 TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) {
   auto timer_node = make_object<OpenCLTimerNode>(dev);
   return Timer(timer_node);
 });

 }  // namespace cl
 // Out-of-class definitions of OpenCLTimerNode's static data members.
 // count_timer_execs starts at 0.
 size_t OpenCLTimerNode::count_timer_execs = 0;
 // event_start_idxs starts out as an empty vector.
 std::vector<size_t> OpenCLTimerNode::event_start_idxs;
 }  // namespace runtime
 }  // namespace tvm
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file opencl_device_api.cc
	*/
	#include <dmlc/parameter.h>
	#include <dmlc/thread_local.h>
	#include <tvm/runtime/profiling.h>
	#include <tvm/runtime/registry.h>

	#include <sstream>

	#include "opencl_common.h"

	namespace tvm {
	namespace runtime {
	namespace cl {

	// Forward declarations of string-returning OpenCL query helpers.
	// Queries the platform attribute `param_name` of platform `pid`.
	std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
	// Queries the device attribute `param_name` of device `pid`.
	std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
	// Reports the OpenCL version of device `pid` as a string.
	std::string GetOpenCLVersion(cl_device_id pid);

	/*!
	 * \brief Origin, extent and pitches describing an image transfer.
	 *
	 * Every member starts out as zero.
	 */
	struct ImageInfo {
	  /*! \brief Offset of the first element, one entry per image axis. */
	  size_t origin[3] = {0, 0, 0};
	  /*! \brief Extent of the transferred area, one entry per image axis. */
	  size_t region[3] = {0, 0, 0};
	  /*! \brief Row pitch of the transfer. */
	  size_t row_pitch{0};
	  /*! \brief Slice pitch of the transfer. */
	  size_t slice_pitch{0};
	};

	/*!
	 * \brief Derive the 2D image region that backs a tensor stored with a texture layout.
	 *
	 * The tensor's logical shape is flattened to a (width, height) pair, using the
	 * separator axis chosen for the scope that corresponds to the descriptor's layout.
	 *
	 * \param desc Descriptor which contains the buffer and layout tag.
	 * \param tensor The DLTensor whose logical shape is flattened.
	 * \return ImageInfo with zero origin, zero pitches and region {width, height, 1}.
	 */
	ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) {
	  ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << tensor->dtype.lanes;

	  const auto scope = cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout);
	  const size_t separator = DefaultTextureLayoutSeparator(tensor->ndim, scope);
	  const auto flattened =
	      ApplyTexture2DFlattening<int64_t>(tensor->shape, tensor->ndim, separator);

	  // Value-initialization already zeroes the origin and both pitches.
	  ImageInfo result{};
	  result.region[0] = flattened.width;
	  result.region[1] = flattened.height;
	  result.region[2] = 1;
	  return result;
	}

	/*!
	 * \brief Map a memory scope name onto the matching buffer memory layout.
	 *
	 * \param mem_scope Optional scope name. An undefined scope and "global" both select
	 *        the flat 1D buffer layout.
	 * \return The layout tag for the scope.
	 * \note A scope with no corresponding layout is reported through LOG(FATAL).
	 */
	cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
	    Optional<String> mem_scope) {
	  using Layout = cl::BufferDescriptor::MemoryLayout;
	  // "global" is the scope ScopeFromMemoryLayout reports for kBuffer1D, so accept it
	  // here as well to keep the two mappings inverse to each other.
	  if (!mem_scope.defined() || mem_scope.value() == "global") {
	    return Layout::kBuffer1D;
	  } else if (mem_scope.value() == "global.texture") {
	    return Layout::kImage2DActivation;
	  } else if (mem_scope.value() == "global.texture-weight") {
	    return Layout::kImage2DWeight;
	  } else if (mem_scope.value() == "global.texture-nhwc") {
	    return Layout::kImage2DNHWC;
	  }
	  LOG(FATAL) << "No memory layout defined for memory of scope: " << mem_scope.value();
	  // Not expected to be reached; mirrors the trailing return in ScopeFromMemoryLayout so
	  // every path of this non-void function ends in a return statement.
	  return Layout::kBuffer1D;
	}

	/*!
	 * \brief Translate a buffer memory layout tag into its memory scope name.
	 * \param layout The layout tag to translate.
	 * \return "global" for the 1D buffer layout, or the matching "global.texture*" name.
	 * \note A layout with no scope is reported through LOG(FATAL).
	 */
	String cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout layout) {
	  using Layout = cl::BufferDescriptor::MemoryLayout;
	  if (layout == Layout::kBuffer1D) {
	    return "global";
	  }
	  if (layout == Layout::kImage2DActivation) {
	    return "global.texture";
	  }
	  if (layout == Layout::kImage2DWeight) {
	    return "global.texture-weight";
	  }
	  if (layout == Layout::kImage2DNHWC) {
	    return "global.texture-nhwc";
	  }
	  LOG(FATAL) << "No scope corresponding to the provided memory layout: "
	             << static_cast<int>(layout);
	  return "";
	}

	/*! \brief Return the entry handed out by OpenCLThreadEntry::ThreadLocal(). */
	OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() {
	  OpenCLThreadEntry* entry = OpenCLThreadEntry::ThreadLocal();
	  return entry;
	}

	/*! \brief Return the single OpenCLWorkspace instance shared by every caller. */
	OpenCLWorkspace* OpenCLWorkspace::Global() {
	  // Created on the first call; this function never deletes it.
	  static OpenCLWorkspace* const workspace = new OpenCLWorkspace();
	  return workspace;
	}

	/*!
	 * \brief Record the device id of `dev` in the calling thread's entry.
	 * \param dev Device whose device_id is stored; its device_type is not used here.
	 */
	void OpenCLWorkspace::SetDevice(Device dev) {
	  OpenCLThreadEntry* entry = GetThreadEntry();
	  entry->device.device_id = dev.device_id;
	}

	/*!
	 * \brief Look up one attribute of an OpenCL device.
	 *
	 * \param dev Device whose device_id indexes the workspace's device list.
	 * \param kind Attribute to query.
	 * \param rv Receives the attribute value. It is left untouched for
	 *        kMaxRegistersPerBlock and kGcnArch.
	 */
	void OpenCLWorkspace::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) {
	  this->Init();
	  size_t index = static_cast<size_t>(dev.device_id);
	  // Existence is answered before the range check, so an out-of-range id yields 0
	  // rather than a failed check.
	  if (kind == kExist) {
	    *rv = static_cast<int>(index < devices.size());
	    return;
	  }
	  ICHECK_LT(index, devices.size()) << "Invalid device id " << index << ". " << GetError();
	  switch (kind) {
	    case kExist:
	      // Already answered above.
	      break;
	    case kMaxThreadsPerBlock: {
	      size_t value;
	      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
	                                  &value, nullptr));
	      *rv = static_cast<int64_t>(value);
	      break;
	    }
	    case kWarpSize: {
	      /* TODO: the warp size of OpenCL device is not always 1
	               e.g. Intel Graphics has a sub group concept which contains 8 - 32 work items,
	               corresponding to the number of SIMD entries the hardware configures.
	               We need to figure out a way to query this information from the hardware.
	      */
	      const int warp_size = dmlc::GetEnv("TVM_OPENCL_WARP_SIZE", 1);
	      *rv = warp_size;
	      break;
	    }
	    case kMaxSharedMemoryPerBlock: {
	      cl_ulong value;
	      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong),
	                                  &value, nullptr));
	      *rv = static_cast<int64_t>(value);
	      break;
	    }
	    case kComputeVersion:
	      *rv = GetOpenCLVersion(devices[index]);
	      break;
	    case kDeviceName:
	      *rv = GetDeviceInfo(devices[index], CL_DEVICE_NAME);
	      break;
	    case kMaxClockRate: {
	      cl_uint value;
	      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint),
	                                  &value, nullptr));
	      // OpenCL returns the clock rate in MHz, while CUDA/ROCm return the
	      // clock rate in kHz. Converting to the same units for each.
	      // Restored: the dereference of rv and the multiplication sign had been lost.
	      *rv = static_cast<int32_t>(value * 1000);
	      break;
	    }
	    case kMultiProcessorCount: {
	      cl_uint value;
	      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
	                                  &value, nullptr));
	      *rv = static_cast<int32_t>(value);
	      break;
	    }
	    case kMaxThreadDimensions: {
	      size_t dims[3];
	      OPENCL_CALL(clGetDeviceInfo(devices[index], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(dims), dims,
	                                  nullptr));

	      std::stringstream ss;  // use json string to return multiple int values;
	      ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
	      *rv = ss.str();
	      break;
	    }
	    case kMaxRegistersPerBlock:
	      // No value is produced for this attribute.
	      return;
	    case kGcnArch:
	      // No value is produced for this attribute.
	      return;
	    case kApiVersion: {
	      *rv = CL_TARGET_OPENCL_VERSION;
	      break;
	    }
	    case kDriverVersion: {
	      // Zero-filled, and the query is given one byte less than the array holds, so the
	      // text stays NUL-terminated.
	      char value[128] = {0};
	      OPENCL_CALL(
	          clGetDeviceInfo(devices[index], CL_DRIVER_VERSION, sizeof(value) - 1, value, nullptr));
	      *rv = std::string(value);
	      break;
	    }
	  }
	}

	/*!
	 * \brief Allocate a flat OpenCL buffer and wrap it in a BufferDescriptor.
	 *
	 * \param dev Target device (not consulted here; the workspace context is used).
	 * \param size Requested size in bytes; 0 is bumped to 1.
	 * \param alignment Unused by this implementation.
	 * \param type_hint Unused by this implementation.
	 * \return Pointer to a heap-allocated cl::BufferDescriptor with layout kBuffer1D.
	 */
	void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment,
	                                      DLDataType type_hint) {
	  this->Init();
	  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();
	  // CL_INVALID_BUFFER_SIZE if size is 0.
	  if (size == 0) {
	    size = 1;
	  }
	  cl_int err_code;
	  cl_mem buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code);
	  // Validate the allocation before creating the descriptor, so a failing check cannot
	  // leave an orphaned descriptor behind.
	  OPENCL_CHECK_ERROR(err_code);
	  cl::BufferDescriptor* desc = new cl::BufferDescriptor;
	  desc->buffer = buffer;
	  desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
	  return desc;
	}

	/*!
	 * \brief Allocate storage for a tensor, choosing buffer or texture by memory scope.
	 *
	 * An undefined scope or "global" is forwarded to DeviceAPI::AllocDataSpace. Any other
	 * scope must be a texture scope, and the shape is flattened to a 2D image.
	 *
	 * \param dev Target device.
	 * \param ndim Rank of `shape`; must be greater than 2 for texture scopes.
	 * \param shape Logical tensor shape.
	 * \param dtype Element type, forwarded to the texture allocation.
	 * \param mem_scope Optional memory scope name.
	 * \return For texture scopes, pointer to a heap-allocated cl::BufferDescriptor.
	 */
	void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
	                                      Optional<String> mem_scope) {
	  // Restored: the logical-or operator had been escaped into "\|\|".
	  if (!mem_scope.defined() || mem_scope.value() == "global") {
	    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
	  }
	  ICHECK(IsTextureStorage(std::string(mem_scope.value())))
	      << "Device does not support allocate data space with "
	      << "specified memory scope: " << mem_scope.value();

	  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
	                   << "provided shape is rank " << ndim;

	  // Work out the image extent before allocating anything that needs cleanup.
	  size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
	  auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);

	  cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
	  try {
	    desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
	  } catch (...) {
	    // If the texture allocation reports its failure as an exception, free the
	    // descriptor before propagating so it is not leaked.
	    delete desc;
	    throw;
	  }
	  return desc;
	}

	/*!
	 * \brief Release the memory object held by a descriptor and delete the descriptor.
	 * \param dev Device whose queue is drained before the release.
	 * \param ptr Pointer to the cl::BufferDescriptor to free.
	 */
	void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
	  // Some OpenCL platforms require that the memory object is no longer referenced by
	  // the command queue, so wait for the queue to finish first.
	  auto queue = this->GetQueue(dev);
	  OPENCL_CALL(clFinish(queue));

	  auto* descriptor = static_cast<cl::BufferDescriptor*>(ptr);
	  OPENCL_CALL(clReleaseMemObject(descriptor->buffer));
	  delete descriptor;
	}

	/*!
	 * \brief Create a read/write 2D RGBA image of the given extent.
	 *
	 * \param dev Target device (not consulted here; the workspace context is used).
	 * \param width Image width passed to the image descriptor.
	 * \param height Image height passed to the image descriptor.
	 * \param type_hint Data type translated into the image channel type.
	 * \return The created OpenCL memory object.
	 */
	cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
	                                     DLDataType type_hint) {
	  this->Init();
	  ICHECK(context != nullptr) << "No OpenCL device. " << GetError();

	  const cl_channel_type channel_type = DTypeToOpenCLChannelType(type_hint);
	  cl_image_format image_format = {CL_RGBA, channel_type};
	  // Only the object type, width and height are set; every remaining field is zero.
	  cl_image_desc image_desc = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0};

	  cl_int err_code;
	  cl_mem image = clCreateImage(this->context, CL_MEM_READ_WRITE, &image_format, &image_desc,
	                               nullptr, &err_code);
	  OPENCL_CHECK_ERROR(err_code);
	  return image;
	}

	/*!
	 * \brief Allocate a texture from the calling thread's texture pool.
	 * \param dev Target device.
	 * \param width Requested texture width.
	 * \param height Requested texture height.
	 * \param type_hint Element type of the texture.
	 * \return Whatever the pool's AllocTexture returns for this request.
	 */
	void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
	                                             DLDataType type_hint) {
	  auto& pool = GetThreadEntry()->texture_pool;
	  return pool.AllocTexture(dev, width, height, type_hint);
	}

	/*!
	 * \brief Hand a texture back to the calling thread's texture pool.
	 * \param dev Device the texture belongs to.
	 * \param ptr Texture handle previously returned by AllocTextureWorkspace.
	 */
	void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
	  auto& pool = GetThreadEntry()->texture_pool;
	  pool.FreeTexture(dev, ptr);
	}

	/*!
	 * \brief Copy the contents of one tensor into another where at least one side is an OpenCL device.
	 *
	 * Both tensors must report the same data size and be contiguous. Three device
	 * combinations are handled: OpenCL to OpenCL, OpenCL to CPU, and CPU to OpenCL;
	 * anything else is a fatal error. On the OpenCL side, the tensor's data pointer is
	 * interpreted as a cl::BufferDescriptor and its layout selects between the buffer
	 * and image variants of the OpenCL copy/read/write calls.
	 *
	 * \param from Source tensor.
	 * \param to Destination tensor.
	 * \param stream Not referenced in the body.
	 */
	void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
	size_t nbytes = GetDataSize(*from);
	ICHECK_EQ(nbytes, GetDataSize(*to));
	ICHECK(IsContiguous(from) && IsContiguous(to))
	<< "CopyDataFromTo only support contiguous array for now";

	if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
	// Device to device: every variant is enqueued on the destination device's queue,
	// and this branch does not call clFinish afterwards.
	const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
	auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
	if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
	from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
	// Buffer -> buffer: plain byte copy honoring both byte offsets.
	OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer,
	to_desc->buffer, from->byte_offset, to->byte_offset, nbytes,
	0, nullptr, nullptr));
	} else if (to_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
	from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
	// Buffer -> image: origin/region come from the destination tensor.
	auto image_info = GetImageInfo(to_desc, to);
	OPENCL_CALL(clEnqueueCopyBufferToImage(this->GetQueue(to->device), from_desc->buffer,
	to_desc->buffer, from->byte_offset, image_info.origin,
	image_info.region, 0, nullptr, nullptr));
	} else if (to_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D &&
	from_desc->layout != cl::BufferDescriptor::MemoryLayout::kBuffer1D) {
	// Image -> buffer: origin/region come from the source tensor.
	auto image_info = GetImageInfo(from_desc, from);
	OPENCL_CALL(clEnqueueCopyImageToBuffer(this->GetQueue(to->device), from_desc->buffer,
	to_desc->buffer, image_info.origin, image_info.region,
	to->byte_offset, 0, nullptr, nullptr));
	} else {
	// Image -> image: the copied region is the one computed for the destination.
	auto to_image_info = GetImageInfo(to_desc, to);
	auto from_image_info = GetImageInfo(from_desc, from);
	OPENCL_CALL(clEnqueueCopyImage(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
	from_image_info.origin, to_image_info.origin,
	to_image_info.region, 0, nullptr, nullptr));
	}
	} else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
	// Device to host: the read is enqueued with CL_FALSE as its blocking flag and
	// the queue is finished right after the switch. The switch has no default
	// case, so a layout not listed below enqueues nothing.
	const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
	switch (from_desc->layout) {
	case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
	OPENCL_CALL(clEnqueueReadBuffer(
	this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
	static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
	break;
	case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
	case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
	case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
	// from->byte_offset is not passed to the image read; only the host-side
	// offset (to->byte_offset) is applied.
	auto image_info = GetImageInfo(from_desc, from);
	// TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
	// Note that when utilizing texture pools for memory reuse, the allocated image
	// size can be larger than the size to be read.
	OPENCL_CALL(clEnqueueReadImage(
	this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
	image_info.region, image_info.row_pitch, image_info.slice_pitch,
	static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
	break;
	}
	OPENCL_CALL(clFinish(this->GetQueue(from->device)));
	} else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
	// Host to device: mirrors the device-to-host branch, using the write calls
	// with CL_FALSE followed by clFinish on the destination device's queue.
	auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
	switch (to_desc->layout) {
	case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
	OPENCL_CALL(clEnqueueWriteBuffer(
	this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
	static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
	break;
	case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
	case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
	case cl::BufferDescriptor::MemoryLayout::kImage2DNHWC:
	// to->byte_offset is not passed to the image write; only the host-side
	// offset (from->byte_offset) is applied.
	auto image_info = GetImageInfo(to_desc, to);
	OPENCL_CALL(clEnqueueWriteImage(
	this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
	image_info.region, image_info.row_pitch, image_info.slice_pitch,
	static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
	break;
	}
	OPENCL_CALL(clFinish(this->GetQueue(to->device)));
	} else {
	LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
	}
	}

	/*!
	 * \brief Wait until every command queued for the device has finished.
	 * \param dev The device whose queue is finished.
	 * \param stream Must be nullptr.
	 */
	void OpenCLWorkspace::StreamSync(Device dev, TVMStreamHandle stream) {
	  ICHECK(stream == nullptr);
	  auto queue = this->GetQueue(dev);
	  OPENCL_CALL(clFinish(queue));
	}

	/*!
	 * \brief Obtain temporary workspace memory from the calling thread's pool.
	 * \param dev The device passed through to the pool.
	 * \param size Requested size passed through to the pool.
	 * \param type_hint Not referenced in the body.
	 * \return The pointer handed back by the pool.
	 */
	void* OpenCLWorkspace::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
	  auto& workspace_pool = GetThreadEntry()->pool;
	  return workspace_pool.AllocWorkspace(dev, size);
	}

	/*!
	 * \brief Hand workspace memory back to the calling thread's pool.
	 * \param dev The device passed through to the pool.
	 * \param data The workspace pointer to give back.
	 */
	void OpenCLWorkspace::FreeWorkspace(Device dev, void* data) {
	  auto& workspace_pool = GetThreadEntry()->pool;
	  workspace_pool.FreeWorkspace(dev, data);
	}

	/*! \brief The dmlc::ThreadLocalStore specialization that holds the OpenCLThreadEntry. */
	using OpenCLThreadStore = dmlc::ThreadLocalStore<OpenCLThreadEntry>;

	/*! \brief Fetch the OpenCLThreadEntry provided by OpenCLThreadStore. */
	OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
	  return OpenCLThreadStore::Get();
	}

	/*!
	 * \brief Query a string-valued platform property.
	 *
	 * The size reported by clGetPlatformInfo counts the terminating NUL, so the
	 * terminator is stripped before returning; otherwise the result would not
	 * compare equal to the same text written as a plain string.
	 *
	 * \param pid The platform to query.
	 * \param param_name The property to query; expected to be string-valued.
	 * \return The property text without a trailing NUL, or an empty string when
	 *         the platform reports a size of zero.
	 */
	std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name) {
	  // First call: only ask how many bytes the value needs.
	  size_t ret_size = 0;
	  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
	  std::string ret(ret_size, '\0');
	  if (ret_size == 0) return ret;
	  // Second call: fetch the value into the pre-sized string.
	  OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
	  // Drop the terminator that OpenCL wrote into the last position(s).
	  while (!ret.empty() && ret.back() == '\0') {
	    ret.pop_back();
	  }
	  return ret;
	}

	/*!
	 * \brief Query a string-valued device property.
	 *
	 * The value is fetched into a std::string buffer, so nothing leaks if the
	 * second query fails, and a reported size of zero yields an empty string
	 * instead of reading an uninitialized buffer.
	 *
	 * \param pid The device to query.
	 * \param param_name The property to query; expected to be string-valued.
	 * \return The property text up to (not including) the first NUL character.
	 */
	std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name) {
	  // First call: only ask how many bytes the value needs.
	  size_t ret_size = 0;
	  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
	  if (ret_size == 0) return std::string();
	  // Second call: fetch the value into the pre-sized buffer.
	  std::string ret(ret_size, '\0');
	  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
	  // Keep only the text before the first NUL, as a C-string conversion would.
	  const size_t terminator = ret.find('\0');
	  if (terminator != std::string::npos) {
	    ret.resize(terminator);
	  }
	  return ret;
	}

	/*!
	 * \brief Extract the "$MAJOR.$MINOR" part of a device's CL_DEVICE_VERSION string.
	 * \param pid The device to query.
	 * \return The text between the leading "OpenCL " prefix and the next space
	 *         (or the end of the string when no further space exists).
	 */
	std::string GetOpenCLVersion(cl_device_id pid) {
	  // The device reports "OpenCL $MAJOR.$MINOR $VENDOR_INFO"; other
	  // implementations return just "$MAJOR.$MINOR", so cut that part out.
	  const std::string full_version = GetDeviceInfo(pid, CL_DEVICE_VERSION);

	  const size_t prefix_len = 7;  // Length of the initial "OpenCL " prefix to skip
	  const size_t space_pos = full_version.find(' ', prefix_len);
	  if (space_pos == std::string::npos) {
	    return full_version.substr(prefix_len);
	  }
	  return full_version.substr(prefix_len, space_pos - prefix_len);
	}

	std::vector<cl_platform_id> GetPlatformIDs() {
	cl_uint ret_size;
	cl_int code = clGetPlatformIDs(0, nullptr, &ret_size);
	std::vector<cl_platform_id> ret;
	if (code != CL_SUCCESS) return ret;
	ret.resize(ret_size);
	OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
	return ret;
	}

	/*!
	 * \brief List the devices of a platform that match a device type name.
	 * \param pid The platform to query.
	 * \param device_type "cpu", "gpu" or "accelerator"; any other value selects all device types.
	 * \return The matching device ids, or an empty vector when the count query fails.
	 */
	std::vector<cl_device_id> GetDeviceIDs(cl_platform_id pid, std::string device_type) {
	  // Translate the textual device type into the OpenCL constant.
	  cl_device_type cl_type;
	  if (device_type == "accelerator") {
	    cl_type = CL_DEVICE_TYPE_ACCELERATOR;
	  } else if (device_type == "gpu") {
	    cl_type = CL_DEVICE_TYPE_GPU;
	  } else if (device_type == "cpu") {
	    cl_type = CL_DEVICE_TYPE_CPU;
	  } else {
	    cl_type = CL_DEVICE_TYPE_ALL;
	  }

	  std::vector<cl_device_id> found;
	  cl_uint num_devices;
	  // First call: only ask how many devices match.
	  if (clGetDeviceIDs(pid, cl_type, 0, nullptr, &num_devices) != CL_SUCCESS) {
	    return found;
	  }
	  found.resize(num_devices);
	  OPENCL_CALL(clGetDeviceIDs(pid, cl_type, num_devices, &found[0], nullptr));
	  return found;
	}

	/*!
	 * \brief Check whether a platform property contains the given text.
	 * \param pid The platform to query.
	 * \param param_name The property to query.
	 * \param value The text to look for; an empty value matches every platform.
	 * \return true when value is empty or occurs in the property string.
	 */
	bool MatchPlatformInfo(cl_platform_id pid, cl_platform_info param_name, std::string value) {
	  if (value.empty()) {
	    return true;
	  }
	  return GetPlatformInfo(pid, param_name).find(value) != std::string::npos;
	}

	/*!
	 * \brief Select an OpenCL platform and its usable devices, then create the context and queues.
	 *
	 * Does nothing when initialized_ is already set (checked once before and once
	 * after taking the mutex) or when a context already exists. The first platform
	 * whose name contains platform_name and that offers at least one device with an
	 * OpenCL version of at least CL_TARGET_OPENCL_VERSION is chosen. If no such
	 * platform exists, a warning is logged and the workspace is marked initialized
	 * without a context.
	 *
	 * \param type_key Stored in this->type_key.
	 * \param device_type Device type name passed to cl::GetDeviceIDs.
	 * \param platform_name Text the platform name has to contain; empty matches any platform.
	 */
	void OpenCLWorkspace::Init(const std::string& type_key, const std::string& device_type,
	const std::string& platform_name) {
	if (initialized_) return;
	std::lock_guard<std::mutex> lock(this->mu);
	if (initialized_) return;
	if (context != nullptr) return;
	this->type_key = type_key;
	// matched platforms
	std::vector<cl_platform_id> platform_ids = cl::GetPlatformIDs();
	if (platform_ids.size() == 0) {
	// Note: this early return leaves initialized_ unset.
	LOG(WARNING) << "No OpenCL platform matched given existing options ...";
	return;
	}
	this->platform_id = nullptr;
	for (auto platform_id : platform_ids) {
	if (!MatchPlatformInfo(platform_id, CL_PLATFORM_NAME, platform_name)) {
	continue;
	}
	std::vector<cl_device_id> devices_matched = cl::GetDeviceIDs(platform_id, device_type);
	// When a "gpu" request finds nothing on this platform, retry with its CPU devices.
	if ((devices_matched.size() == 0) && (device_type == "gpu")) {
	LOG(WARNING) << "Using CPU OpenCL device";
	devices_matched = cl::GetDeviceIDs(platform_id, "cpu");
	}
	std::vector<cl_device_id> supported_devices = {};
	// Formats an integer version such as 120 as text with one decimal ("1.2").
	auto get_version_str = [](int version) {
	std::ostringstream out;
	out.precision(1);
	out << std::fixed << version / 100.f;
	return out.str();
	};
	// Keep only devices whose OpenCL version is at least CL_TARGET_OPENCL_VERSION;
	// the others are logged and collected into noDevicesErrorMsg.
	for (auto& device : devices_matched) {
	std::string ver = GetOpenCLVersion(device);
	// "$MAJOR.$MINOR" scaled by 100 so it can be compared with CL_TARGET_OPENCL_VERSION.
	int opencl_version = std::stod(ver) * 100;
	if (opencl_version >= CL_TARGET_OPENCL_VERSION) {
	supported_devices.push_back(device);
	} else {
	std::string dev_msg = GetDeviceInfo(device, CL_DEVICE_NAME) +
	" has OpenCL version == " + get_version_str(opencl_version);
	LOG(WARNING) << "TVM supports devices with OpenCL version >= "
	<< get_version_str(CL_TARGET_OPENCL_VERSION) << ", device " << dev_msg
	<< ". This device will be ignored.";

	// The explanatory header is written once; each rejected device adds one line.
	if (noDevicesErrorMsg.empty()) {
	noDevicesErrorMsg =
	"Probably this error happen because TVM supports devices with OpenCL version >= " +
	get_version_str(CL_TARGET_OPENCL_VERSION) + ". We found the following devices:\n";
	}
	noDevicesErrorMsg += "\t" + dev_msg + "\n";
	}
	}
	// The first platform with at least one supported device wins.
	if (supported_devices.size() > 0) {
	this->platform_id = platform_id;
	this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME);
	// Stores the requested device_type, even if the CPU fallback above was taken.
	this->device_type = device_type;
	this->devices = supported_devices;
	break;
	}
	}
	if (this->platform_id == nullptr) {
	// No usable platform: mark as initialized so the search is not repeated.
	LOG(WARNING) << "No OpenCL device";
	initialized_ = true;
	return;
	}
	cl_int err_code;
	// One context that spans every selected device.
	this->context = clCreateContext(nullptr, this->devices.size(), &(this->devices[0]), nullptr,
	nullptr, &err_code);
	OPENCL_CHECK_ERROR(err_code);
	ICHECK_EQ(this->queues.size(), 0U);
	// One command queue per device, created with properties set to 0.
	for (size_t i = 0; i < this->devices.size(); ++i) {
	cl_device_id did = this->devices[i];
	this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code));
	OPENCL_CHECK_ERROR(err_code);
	}
	this->events.resize(this->devices.size());
	initialized_ = true;
	}

	/*!
	 * \brief Packed function "device_api.opencl.alloc_nd": allocate a 2D texture workspace.
	 *
	 * Arguments, in order: device type, device id, dtype code hint, dtype bits hint,
	 * memory scope (must contain "texture"), number of dimensions (must be 2), and
	 * a pointer to the shape array whose two entries are read as width and height.
	 * The return value is the pointer produced by AllocTextureWorkspace.
	 */
	TVM_REGISTER_GLOBAL("device_api.opencl.alloc_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
	  int32_t device_type = args[0];
	  int32_t device_id = args[1];
	  int32_t dtype_code_hint = args[2];
	  int32_t dtype_bits_hint = args[3];
	  std::string scope = args[4];
	  CHECK(scope.find("texture") != std::string::npos);
	  int64_t ndim = args[5];
	  CHECK_EQ(ndim, 2);
	  // The shape arrives as an opaque handle; view it as an array of int64_t extents.
	  int64_t* shape = static_cast<int64_t*>(static_cast<void*>(args[6]));
	  int64_t width = shape[0];
	  int64_t height = shape[1];

	  Device dev;
	  dev.device_type = static_cast<DLDeviceType>(device_type);
	  dev.device_id = device_id;

	  DLDataType type_hint;
	  type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
	  type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
	  type_hint.lanes = 1;

	  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
	  *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height),
	                                   type_hint);
	});

	/*!
	 * \brief Packed function "device_api.opencl.free_nd": give back a texture workspace.
	 *
	 * Arguments, in order: device type, device id, memory scope (must contain
	 * "texture"), and the data pointer to free. Always returns 0.
	 */
	TVM_REGISTER_GLOBAL("device_api.opencl.free_nd").set_body([](TVMArgs args, TVMRetValue* rv) {
	  int32_t device_type = args[0];
	  int32_t device_id = args[1];
	  std::string scope = args[2];
	  CHECK(scope.find("texture") != std::string::npos);
	  void* data = args[3];

	  Device dev;
	  dev.device_id = device_id;
	  dev.device_type = static_cast<DLDeviceType>(device_type);

	  OpenCLWorkspace::Global()->FreeTextureWorkspace(dev, data);
	  *rv = static_cast<int32_t>(0);
	});

	/*!
	 * \brief Packed function "device_api.opencl": return the global OpenCLWorkspace
	 *        as an opaque DeviceAPI handle.
	 */
	TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, TVMRetValue* rv) {
	  DeviceAPI* ptr = OpenCLWorkspace::Global();
	  // Store the pointer into the return slot as a void* handle.
	  *rv = static_cast<void*>(ptr);
	});

	// Register the OpenCLTimerNode object type.
	TVM_REGISTER_OBJECT_TYPE(OpenCLTimerNode);

	/*! \brief Packed function "profiling.timer.opencl": build a Timer around a new OpenCLTimerNode. */
	TVM_REGISTER_GLOBAL("profiling.timer.opencl").set_body_typed([](Device dev) {
	  auto timer_node = make_object<OpenCLTimerNode>(dev);
	  return Timer(timer_node);
	});

	} // namespace cl
	// Out-of-class definitions of OpenCLTimerNode's static data members:
	// count_timer_execs starts at zero and event_start_idxs starts empty.
	size_t OpenCLTimerNode::count_timer_execs = 0;
	std::vector<size_t> OpenCLTimerNode::event_start_idxs;
	} // namespace runtime
	} // namespace tvm