| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file opencl_module.cc |
| */ |
| #include "opencl_module.h" |
| |
| #include <dmlc/memory_io.h> |
| #include <tvm/ffi/function.h> |
| #include <tvm/ffi/reflection/registry.h> |
| |
| #include <string> |
| #include <unordered_map> |
| #include <vector> |
| |
| #include "../source_utils.h" |
| #include "opencl_common.h" |
| |
| namespace tvm { |
| namespace runtime { |
| |
| class OpenCLWrappedFunc { |
| public: |
| // initialize the OpenCL function. |
| void Init(OpenCLModuleNodeBase* m, ObjectPtr<Object> sptr, OpenCLModuleNode::KTRefEntry entry, |
| std::string func_name, std::vector<size_t> arg_size, |
| const std::vector<std::string>& launch_param_tags) { |
| w_ = m->GetGlobalWorkspace(); |
| m_ = m; |
| sptr_ = sptr; |
| entry_ = entry; |
| func_name_ = func_name; |
| arg_size_ = arg_size; |
| launch_param_config_.Init(arg_size.size(), launch_param_tags); |
| } |
| // invoke the function with void arguments |
| void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { |
| ICHECK(w_->devices.size() > 0) << "No OpenCL device"; |
| cl::OpenCLThreadEntry* t = w_->GetThreadEntry(); |
| // get the kernel from thread local kernel table. |
| if (entry_.kernel_id >= t->kernel_table.size()) { |
| t->kernel_table.resize(entry_.kernel_id + 1); |
| } |
| const auto& e = t->kernel_table[entry_.kernel_id]; |
| cl_kernel kernel = e.kernel; |
| if (kernel == nullptr || e.version != entry_.version) { |
| kernel = m_->InstallKernel(w_, t, func_name_, entry_); |
| } |
| // setup arguments. |
| for (cl_uint i = 0; i < arg_size_.size(); ++i) { |
| void* arg = nullptr; |
| if (args[i].as<void*>()) { |
| arg = static_cast<cl::BufferDescriptor*>(void_args[i])->buffer; |
| } else { |
| arg = void_args[i]; |
| } |
| OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg)); |
| } |
| cl_command_queue queue = w_->GetQueue(t->device); |
| ThreadWorkLoad wl = launch_param_config_.Extract(args); |
| cl_uint work_dim = static_cast<cl_uint>(launch_param_config_.work_dim()); |
| for (cl_uint i = 0; i < work_dim; ++i) { |
| wl.work_size[i] *= wl.work_size[i + 3]; |
| } |
| // launch kernel |
| |
| if (w_->IsProfiling(t->device)) { |
| w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1); |
| OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size, |
| wl.work_size + 3, 0, nullptr, |
| &(w_->GetEventQueue(t->device).back()))); |
| } else { |
| OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size, |
| wl.work_size + 3, 0, nullptr, nullptr)); |
| } |
| } |
| |
| private: |
| // global workspace. |
| cl::OpenCLWorkspace* w_; |
| // The module |
| OpenCLModuleNodeBase* m_; |
| // resource handle |
| ObjectPtr<Object> sptr_; |
| // global kernel id in the kernel table. |
| OpenCLModuleNode::KTRefEntry entry_; |
| // The name of the function. |
| std::string func_name_; |
| // convert code for void argument |
| std::vector<size_t> arg_size_; |
| // launch parameters config |
| LaunchParamConfig launch_param_config_; |
| }; |
| |
| OpenCLModuleNodeBase::~OpenCLModuleNodeBase() { |
| { |
| // free the kernel ids in global table. |
| std::lock_guard<std::mutex> lock(workspace_->mu); |
| for (auto& kv : kid_map_) { |
| workspace_->free_kernel_ids.push_back(kv.second.kernel_id); |
| } |
| } |
| // free the kernels |
| for (cl_kernel k : kernels_) { |
| OPENCL_CALL(clReleaseKernel(k)); |
| } |
| // free the programs |
| for (auto& kv : programs_) { |
| for (auto& program : kv.second) { |
| if (program) { |
| OPENCL_CALL(clReleaseProgram(program)); |
| } |
| } |
| } |
| } |
| |
| cl::OpenCLWorkspace* OpenCLModuleNodeBase::GetGlobalWorkspace() { |
| return cl::OpenCLWorkspace::Global(); |
| } |
| |
| ffi::Optional<ffi::Function> OpenCLModuleNodeBase::GetFunction(const ffi::String& name) { |
| ObjectPtr<Object> sptr_to_self = ffi::GetObjectPtr<Object>(this); |
| ICHECK_EQ(sptr_to_self.get(), this); |
| auto it = fmap_.find(name); |
| if (it == fmap_.end()) return std::nullopt; |
| const FunctionInfo& info = it->second; |
| OpenCLWrappedFunc f; |
| std::vector<size_t> arg_size(info.arg_types.size()); |
| for (size_t i = 0; i < info.arg_types.size(); ++i) { |
| DLDataType t = info.arg_types[i]; |
| ICHECK_EQ(t.lanes, 1U); |
| if (t.code == kDLOpaqueHandle) { |
| // specially store pointer type size in OpenCL driver |
| arg_size[i] = sizeof(void*); |
| } else { |
| uint32_t bits = t.bits; |
| ICHECK_EQ(bits % 8, 0U); |
| arg_size[i] = bits / 8; |
| } |
| } |
| // initialize the wrapped func. |
| f.Init(this, sptr_to_self, kid_map_.at(name), name, arg_size, info.launch_param_tags); |
| return PackFuncVoidAddr(f, info.arg_types); |
| } |
| |
| void OpenCLModuleNode::WriteToFile(const ffi::String& file_name, const ffi::String& format) const { |
| std::string fmt = GetFileFormat(file_name, format); |
| ICHECK_EQ(fmt, fmt_) << "Can only save to format=" << fmt_; |
| std::string meta_file = GetMetaFilePath(file_name); |
| SaveMetaDataToFile(meta_file, fmap_); |
| SaveBinaryToFile(file_name, data_); |
| } |
| |
| ffi::Bytes OpenCLModuleNode::SaveToBytes() const { |
| std::string buffer; |
| dmlc::MemoryStringStream ms(&buffer); |
| dmlc::Stream* stream = &ms; |
| stream->Write(fmt_); |
| stream->Write(fmap_); |
| stream->Write(data_); |
| return ffi::Bytes(buffer); |
| } |
| |
| ffi::String OpenCLModuleNode::InspectSource(const ffi::String& format) const { |
| if (format == fmt_) return data_; |
| if (fmt_ == "cl") { |
| return data_; |
| } else { |
| return source_; |
| } |
| } |
| |
| void OpenCLModuleNode::Init() { |
| workspace_ = GetGlobalWorkspace(); |
| // initialize the kernel id, need to lock global table. |
| std::lock_guard<std::mutex> lock(workspace_->mu); |
| for (const auto& kv : fmap_) { |
| const std::string& key = kv.first; |
| KTRefEntry e; |
| if (workspace_->free_kernel_ids.size() != 0) { |
| e.kernel_id = workspace_->free_kernel_ids.back(); |
| workspace_->free_kernel_ids.pop_back(); |
| } else { |
| e.kernel_id = workspace_->num_registered_kernels++; |
| } |
| e.version = workspace_->timestamp++; |
| kid_map_[key] = e; |
| } |
| |
| // split into source artifacts for each kernel |
| parsed_kernels_ = SplitKernels(InspectSource("cl")); |
| ICHECK(!parsed_kernels_.empty()) << "The OpenCL module expects a kernel delimited " |
| << "source from code generation, but no kernel " |
| << "delimiter was found."; |
| ICHECK_EQ(fmap_.size(), parsed_kernels_.size()) |
| << "The number of parsed kernel sources does not match the number of kernel functions"; |
| } |
| |
| bool OpenCLModuleNode::IsProgramCreated(const std::string& func_name, int device_id) { |
| auto size = programs_[func_name].size(); |
| if (size > 0 && programs_[func_name][device_id] != nullptr) return true; |
| auto dev_size = GetGlobalWorkspace()->devices.size(); |
| ICHECK(device_id < static_cast<int>(dev_size)) |
| << "Device id " << device_id << " is bigger than number of available devices"; |
| // zero initialize cl_program pointers for each device kernel |
| if (size == 0) programs_[func_name].resize(dev_size, nullptr); |
| return false; |
| } |
| |
| cl_kernel OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t, |
| const std::string& func_name, const KTRefEntry& e) { |
| std::lock_guard<std::mutex> lock(build_lock_); |
| int device_id = t->device.device_id; |
| auto did = w->GetCLDeviceID(device_id); |
| auto platform = w->device_info[did].platform_id; |
| if (!IsProgramCreated(func_name, device_id)) { |
| // create program |
| if (fmt_ == "cl") { |
| const char* s = parsed_kernels_[func_name].c_str(); |
| size_t len = parsed_kernels_[func_name].length(); |
| cl_int err; |
| programs_[func_name][device_id] = |
| clCreateProgramWithSource(w->contexts[platform], 1, &s, &len, &err); |
| OPENCL_CHECK_ERROR(err); |
| } else if (fmt_ == "xclbin" || fmt_ == "awsxclbin" || fmt_ == "aocx") { |
| const unsigned char* s = (const unsigned char*)data_.c_str(); |
| size_t len = data_.length(); |
| cl_int err; |
| cl_device_id dev = w->devices[device_id]; |
| programs_[func_name][device_id] = |
| clCreateProgramWithBinary(w->contexts[platform], 1, &dev, &len, &s, nullptr, &err); |
| OPENCL_CHECK_ERROR(err); |
| } else { |
| LOG(FATAL) << "Unknown OpenCL format " << fmt_; |
| } |
| // build program |
| cl_int err; |
| cl_device_id dev = w->devices[device_id]; |
| err = clBuildProgram(programs_[func_name][device_id], 1, &dev, nullptr, nullptr, nullptr); |
| if (err != CL_SUCCESS) { |
| size_t len; |
| std::string log; |
| clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, |
| &len); |
| log.resize(len); |
| clGetProgramBuildInfo(programs_[func_name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, |
| &log[0], nullptr); |
| LOG(FATAL) << "OpenCL build error for device=" << dev |
| << "\nError: " << cl::CLGetErrorString(err) << "\n" |
| << log; |
| } |
| } |
| // build kernel |
| cl_int err; |
| cl_kernel kernel = clCreateKernel(programs_[func_name][device_id], func_name.c_str(), &err); |
| OPENCL_CHECK_ERROR(err); |
| t->kernel_table[e.kernel_id].kernel = kernel; |
| t->kernel_table[e.kernel_id].version = e.version; |
| kernels_.push_back(kernel); |
| return kernel; |
| } |
| |
| void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) { |
| workspace_->Init(); |
| std::string data = bytes; |
| dmlc::MemoryStringStream reader(&data); |
| dmlc::Stream* strm = &reader; |
| uint64_t kernels_num; |
| strm->Read(&kernels_num); |
| cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); |
| int device_id = t->device.device_id; |
| for (size_t i = 0; i < kernels_num; ++i) { |
| std::string name; |
| std::vector<unsigned char> bin_vector; |
| strm->Read(&name); |
| strm->Read(&bin_vector); |
| if (!IsProgramCreated(name, device_id)) { |
| cl_int err = 0; |
| cl_int binaryStatus; |
| size_t binarySize = bin_vector.size(); |
| const unsigned char* programBinary = bin_vector.data(); |
| |
| cl_device_id dev = workspace_->GetCLDeviceID(device_id); |
| auto platform = workspace_->device_info[dev].platform_id; |
| programs_[name][device_id] = |
| clCreateProgramWithBinary(workspace_->contexts[platform], 1, &dev, &binarySize, |
| &programBinary, &binaryStatus, &err); |
| OPENCL_CHECK_ERROR(err); |
| OPENCL_CHECK_ERROR(binaryStatus); |
| |
| err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, nullptr, nullptr); |
| if (err != CL_SUCCESS) { |
| size_t len; |
| std::string log; |
| clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, |
| &len); |
| log.resize(len); |
| clGetProgramBuildInfo(programs_[name][device_id], dev, CL_PROGRAM_BUILD_LOG, len, &log[0], |
| nullptr); |
| LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log; |
| } |
| } |
| } |
| } |
| |
| std::string OpenCLModuleNode::GetPreCompiledPrograms() { |
| workspace_->Init(); |
| std::string data; |
| dmlc::MemoryStringStream writer(&data); |
| dmlc::Stream* strm = &writer; |
| strm->Write(static_cast<uint64_t>(parsed_kernels_.size())); |
| for (auto& it : parsed_kernels_) { |
| std::string name = it.first; |
| cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry(); |
| int device_id = t->device.device_id; |
| t->kernel_table.resize(workspace_->num_registered_kernels); |
| if (!IsProgramCreated(name, device_id)) { |
| InstallKernel(workspace_, t, name, kid_map_[name]); |
| } |
| size_t size; |
| clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, |
| nullptr); |
| ICHECK(size > 0) << "Size of binary is 0"; |
| std::vector<unsigned char> bin_vector(size); |
| unsigned char* binary = bin_vector.data(); |
| clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, sizeof(unsigned char*), |
| &binary, nullptr); |
| |
| strm->Write(name); |
| strm->Write(bin_vector); |
| } |
| return data; |
| } |
| |
| ffi::Optional<ffi::Function> OpenCLModuleNode::GetFunction(const ffi::String& name) { |
| ObjectPtr<Object> sptr_to_self = ffi::GetObjectPtr<Object>(this); |
| ICHECK_EQ(sptr_to_self.get(), this); |
| if (name == "opencl.GetPreCompiledPrograms") { |
| return ffi::Function([sptr_to_self, this](ffi::PackedArgs args, ffi::Any* rv) { |
| *rv = this->GetPreCompiledPrograms(); |
| }); |
| } else if (name == "opencl.SetPreCompiledPrograms") { |
| return ffi::Function([sptr_to_self, this](ffi::PackedArgs args, ffi::Any* rv) { |
| this->SetPreCompiledPrograms(args[0].cast<std::string>()); |
| }); |
| } |
| return OpenCLModuleNodeBase::GetFunction(name); |
| } |
| |
| ffi::Module OpenCLModuleCreate(std::string data, std::string fmt, |
| std::unordered_map<std::string, FunctionInfo> fmap, |
| std::string source) { |
| auto n = ffi::make_object<OpenCLModuleNode>(data, fmt, fmap, source); |
| n->Init(); |
| return ffi::Module(n); |
| } |
| |
| // Load module from module. |
| ffi::Module OpenCLModuleLoadFile(const std::string& file_name, const ffi::String& format) { |
| std::string data; |
| std::unordered_map<std::string, FunctionInfo> fmap; |
| std::string fmt = GetFileFormat(file_name, format); |
| std::string meta_file = GetMetaFilePath(file_name); |
| LoadBinaryFromFile(file_name, &data); |
| LoadMetaDataFromFile(meta_file, &fmap); |
| return OpenCLModuleCreate(data, fmt, fmap, std::string()); |
| } |
| |
| ffi::Module OpenCLModuleLoadFromBytes(const ffi::Bytes& bytes) { |
| dmlc::MemoryFixedSizeStream ms(const_cast<char*>(bytes.data()), bytes.size()); |
| dmlc::Stream* stream = &ms; |
| std::string data; |
| std::unordered_map<std::string, FunctionInfo> fmap; |
| std::string fmt; |
| stream->Read(&fmt); |
| stream->Read(&fmap); |
| stream->Read(&data); |
| return OpenCLModuleCreate(data, fmt, fmap, std::string()); |
| } |
| |
| TVM_FFI_STATIC_INIT_BLOCK() { |
| namespace refl = tvm::ffi::reflection; |
| refl::GlobalDef() |
| .def("ffi.Module.load_from_file.cl", OpenCLModuleLoadFile) |
| .def("ffi.Module.load_from_file.clbin", OpenCLModuleLoadFile) |
| .def("ffi.Module.load_from_bytes.opencl", OpenCLModuleLoadFromBytes); |
| } |
| } // namespace runtime |
| } // namespace tvm |