/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
| |
#include "vulkan_device_api.h"

#include <tvm/ffi/reflection/registry.h>

#include <algorithm>
#include <cstring>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "vulkan_common.h"
| |
| namespace tvm { |
| namespace runtime { |
| namespace vulkan { |
| |
| VulkanDeviceAPI* VulkanDeviceAPI::Global() { |
| // Most of the TVM Global() functions allocate with "new" and do |
| // not deallocate, as the OS can clean up any leftover buffers at |
| // the end. In this case, we need the VulkanDeviceAPI destructor |
| // to call vkDestroyInstance, to prevent a segfault on exit when |
| // using some nvidia drivers. |
| static VulkanDeviceAPI inst; |
| return &inst; |
| } |
| |
| VulkanDeviceAPI::VulkanDeviceAPI() { |
| std::vector<VkPhysicalDevice> vulkan_physical_devices = instance_.GetPhysicalDevices(); |
| for (VkPhysicalDevice phy_dev : vulkan_physical_devices) { |
| VulkanDevice device(instance_, phy_dev); |
| |
| if (device.SupportsCompute()) { |
| devices_.push_back(std::move(device)); |
| } |
| } |
| |
| // Move discrete GPUs to the start of the list, so the default |
| // device_id=0 preferentially uses a discrete GPU. |
| auto preference = [](const VulkanDevice& device) { |
| const std::string& type = device.device_properties.device_type; |
| if (type == "discrete") { |
| return 0; |
| } else if (type == "integrated") { |
| return 1; |
| } else if (type == "virtual") { |
| return 2; |
| } else if (type == "cpu") { |
| return 3; |
| } else { |
| return 4; |
| } |
| }; |
| |
| std::stable_sort(devices_.begin(), devices_.end(), |
| [&preference](const VulkanDevice& a, const VulkanDevice& b) { |
| return preference(a) < preference(b); |
| }); |
| } |
| |
| VulkanDeviceAPI::~VulkanDeviceAPI() {} |
| |
| void VulkanDeviceAPI::SetDevice(Device dev) { |
| TVM_FFI_ICHECK_EQ(dev.device_type, kDLVulkan) |
| << "Active vulkan device cannot be set to non-vulkan device" << dev; |
| |
| TVM_FFI_ICHECK_LE(dev.device_id, static_cast<int>(devices_.size())) |
| << "Attempted to set active vulkan device to device_id==" << dev.device_id << ", but only " |
| << devices_.size() << " devices present"; |
| |
| active_device_id_per_thread.GetOrMake(0) = dev.device_id; |
| } |
| |
| int VulkanDeviceAPI::GetActiveDeviceID() { return active_device_id_per_thread.GetOrMake(0); } |
| |
| VulkanDevice& VulkanDeviceAPI::GetActiveDevice() { return device(GetActiveDeviceID()); } |
| |
| void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) { |
| size_t index = static_cast<size_t>(dev.device_id); |
| if (kind == kExist) { |
| *rv = static_cast<int>(index < devices_.size()); |
| return; |
| } |
| |
| const auto& prop = device(index).device_properties; |
| |
| switch (kind) { |
| case kMaxThreadsPerBlock: { |
| *rv = int64_t(prop.max_num_threads); |
| break; |
| } |
| case kMaxSharedMemoryPerBlock: { |
| *rv = int64_t(prop.max_shared_memory_per_block); |
| break; |
| } |
| case kWarpSize: { |
| *rv = int64_t(prop.thread_warp_size); |
| break; |
| } |
| case kComputeVersion: { |
| int64_t value = prop.vulkan_api_version; |
| std::ostringstream os; |
| os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "." |
| << VK_VERSION_PATCH(value); |
| *rv = os.str(); |
| break; |
| } |
| case kDeviceName: |
| *rv = std::string(prop.device_name); |
| break; |
| |
| case kMaxClockRate: |
| break; |
| |
| case kMultiProcessorCount: |
| break; |
| |
| case kExist: |
| break; |
| |
| case kMaxThreadDimensions: { |
| std::stringstream ss; // use json string to return multiple int values; |
| ss << "[" << prop.max_block_size_x << ", " << prop.max_block_size_y << ", " |
| << prop.max_block_size_z << "]"; |
| *rv = ss.str(); |
| break; |
| } |
| |
| case kMaxRegistersPerBlock: |
| break; |
| |
| case kGcnArch: |
| break; |
| |
| case kApiVersion: |
| *rv = VK_HEADER_VERSION; |
| break; |
| |
| case kDriverVersion: { |
| int64_t value = prop.driver_version; |
| std::ostringstream os; |
| os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "." |
| << VK_VERSION_PATCH(value); |
| *rv = os.str(); |
| break; |
| } |
| |
| case kL2CacheSizeBytes: |
| break; |
| |
| case kTotalGlobalMemory: { |
| *rv = device(index).compute_memory_size; |
| return; |
| } |
| case kAvailableGlobalMemory: |
| // Not currently implemented. Will only be implementable for |
| // devices that support the VK_EXT_memory_budget extension. |
| break; |
| case kImagePitchAlignment: |
| return; |
| } |
| } |
| |
| void VulkanDeviceAPI::GetTargetProperty(Device dev, const std::string& property, ffi::Any* rv) { |
| size_t index = static_cast<size_t>(dev.device_id); |
| const auto& prop = device(index).device_properties; |
| |
| if (property == "supports_float16") { |
| *rv = prop.supports_float16; |
| } |
| if (property == "supports_float32") { |
| *rv = prop.supports_float32; |
| } |
| if (property == "supports_float64") { |
| *rv = prop.supports_float64; |
| } |
| if (property == "supports_int8") { |
| *rv = prop.supports_int8; |
| } |
| if (property == "supports_int16") { |
| *rv = prop.supports_int16; |
| } |
| if (property == "supports_int32") { |
| *rv = prop.supports_int32; |
| } |
| if (property == "supports_int64") { |
| *rv = prop.supports_int64; |
| } |
| if (property == "supports_8bit_buffer") { |
| *rv = prop.supports_8bit_buffer; |
| } |
| if (property == "supports_16bit_buffer") { |
| *rv = prop.supports_16bit_buffer; |
| } |
| if (property == "supports_storage_buffer_storage_class") { |
| *rv = prop.supports_storage_buffer_storage_class; |
| } |
| if (property == "supports_push_descriptor") { |
| *rv = prop.supports_push_descriptor; |
| } |
| if (property == "supports_dedicated_allocation") { |
| *rv = prop.supports_dedicated_allocation; |
| } |
| if (property == "supported_subgroup_operations") { |
| *rv = int64_t(prop.supported_subgroup_operations); |
| } |
| if (property == "max_num_threads") { |
| *rv = int64_t(prop.max_num_threads); |
| } |
| if (property == "thread_warp_size") { |
| *rv = int64_t(prop.thread_warp_size); |
| } |
| if (property == "max_block_size_x") { |
| *rv = int64_t(prop.max_block_size_x); |
| } |
| if (property == "max_block_size_y") { |
| *rv = int64_t(prop.max_block_size_y); |
| } |
| if (property == "max_block_size_z") { |
| *rv = int64_t(prop.max_block_size_z); |
| } |
| if (property == "max_push_constants_size") { |
| *rv = int64_t(prop.max_push_constants_size); |
| } |
| if (property == "max_uniform_buffer_range") { |
| *rv = int64_t(prop.max_uniform_buffer_range); |
| } |
| if (property == "max_storage_buffer_range") { |
| *rv = int64_t(prop.max_storage_buffer_range); |
| } |
| if (property == "max_per_stage_descriptor_storage_buffer") { |
| *rv = int64_t(prop.max_per_stage_descriptor_storage_buffer); |
| } |
| if (property == "max_shared_memory_per_block") { |
| *rv = int64_t(prop.max_shared_memory_per_block); |
| } |
| |
| if (property == "supports_integer_dot_product") { |
| *rv = prop.supports_integer_dot_product; |
| } |
| |
| if (property == "supports_cooperative_matrix") { |
| *rv = prop.supports_cooperative_matrix; |
| } |
| |
| if (property == "device_name") { |
| *rv = prop.device_name; |
| } |
| if (property == "device_type") { |
| *rv = prop.device_type; |
| } |
| if (property == "driver_name") { |
| *rv = prop.driver_name; |
| } |
| if (property == "driver_version") { |
| *rv = int64_t(prop.driver_version); |
| } |
| if (property == "vulkan_api_version") { |
| *rv = int64_t(prop.vulkan_api_version); |
| } |
| if (property == "max_spirv_version") { |
| *rv = int64_t(prop.max_spirv_version); |
| } |
| } |
| |
| void* VulkanDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, |
| DLDataType type_hint) { |
| if (nbytes == 0) { |
| // Vulkan seems to have issues if we return nullptr on zero size alloc |
| nbytes = 1; |
| } |
| const auto& device = this->device(dev.device_id); |
| auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | |
| VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; |
| return new VulkanBuffer(device, nbytes, usage, device.compute_mtype_index); |
| } |
| |
| void VulkanDeviceAPI::FreeDataSpace(Device dev, void* ptr) { |
| // Before releasing the vkBuffer, call sync to |
| // finish all the vulkan commands that reference the buffer. |
| StreamSync(dev, nullptr); |
| |
| auto* pbuf = static_cast<VulkanBuffer*>(ptr); |
| delete pbuf; |
| } |
| |
| void* VulkanDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { |
| auto& pool = pool_per_thread.GetOrMake(kDLVulkan, this); |
| return pool.AllocWorkspace(dev, size); |
| } |
| |
| void VulkanDeviceAPI::FreeWorkspace(Device dev, void* data) { |
| auto* pool = pool_per_thread.Get(); |
| TVM_FFI_ICHECK(pool) << "Attempted to free a vulkan workspace on a CPU-thread " |
| << "that has never allocated a workspace"; |
| pool->FreeWorkspace(dev, data); |
| } |
| |
| TVMStreamHandle VulkanDeviceAPI::CreateStream(Device dev) { return nullptr; } |
| |
| void VulkanDeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) { |
| TVM_FFI_ICHECK_EQ(stream, static_cast<void*>(nullptr)); |
| } |
| |
| // Syncing two streams is a nop, since there is only one stream. |
| void VulkanDeviceAPI::SyncStreamFromTo(Device dev, TVMStreamHandle event_src, |
| TVMStreamHandle event_dst) { |
| TVM_FFI_ICHECK_EQ(event_src, static_cast<void*>(nullptr)); |
| TVM_FFI_ICHECK_EQ(event_dst, static_cast<void*>(nullptr)); |
| } |
| |
| void VulkanDeviceAPI::StreamSync(Device dev, TVMStreamHandle stream) { |
| TVM_FFI_ICHECK_EQ(stream, static_cast<void*>(nullptr)); |
| device(dev.device_id).ThreadLocalStream().Synchronize(); |
| } |
| |
| void VulkanDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, |
| size_t to_offset, size_t size, Device dev_from, Device dev_to, |
| DLDataType type_hint, TVMStreamHandle stream) { |
| TVM_FFI_ICHECK(stream == nullptr); |
| Device dev = dev_from; |
| if (dev_from.device_type == kDLCPU) { |
| dev = dev_to; |
| } |
| |
| int from_dev_type = static_cast<int>(dev_from.device_type); |
| int to_dev_type = static_cast<int>(dev_to.device_type); |
| if (from_dev_type == kDLVulkan && to_dev_type == kDLVulkan) { |
| TVM_FFI_ICHECK_EQ(dev_from.device_id, dev_to.device_id) |
| << "The Vulkan runtime does not support deviceA to deviceB copies. " |
| << "This should be changed to a deviceA to CPU copy, followed by a CPU to deviceB copy"; |
| |
| device(dev_from.device_id).ThreadLocalStream().Launch([=](VulkanStreamState* state) { |
| // 1: copy |
| const auto* from_buf = static_cast<const VulkanBuffer*>(from); |
| auto* to_buf = static_cast<VulkanBuffer*>(to); |
| VkBufferCopy copy_info; |
| copy_info.srcOffset = from_offset; |
| copy_info.dstOffset = to_offset; |
| copy_info.size = size; |
| vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, to_buf->buffer, 1, ©_info); |
| // 2: barrier(transfer-> compute|transfer) |
| VkMemoryBarrier barrier_info; |
| barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; |
| barrier_info.pNext = nullptr; |
| barrier_info.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| barrier_info.dstAccessMask = (VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT | |
| VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT); |
| vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_TRANSFER_BIT, |
| VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, |
| 1, &barrier_info, 0, nullptr, 0, nullptr); |
| }); |
| |
| } else if (from_dev_type == kDLVulkan && to_dev_type == kDLCPU) { |
| const auto* from_buf = static_cast<const VulkanBuffer*>(from); |
| auto& device = this->device(dev_from.device_id); |
| auto& stream = device.ThreadLocalStream(); |
| auto& staging_buffer = device.ThreadLocalStagingBuffer(size); |
| stream.Launch([&](VulkanStreamState* state) { |
| VkBufferCopy copy_info; |
| copy_info.srcOffset = from_offset; |
| copy_info.dstOffset = 0; |
| copy_info.size = size; |
| vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, staging_buffer.vk_buf.buffer, 1, |
| ©_info); |
| }); |
| stream.Synchronize(); |
| stream.ProfilerReset(); |
| if (!device.coherent_staging) { |
| VkMappedMemoryRange mrange; |
| mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; |
| mrange.pNext = nullptr; |
| mrange.memory = staging_buffer.vk_buf.memory; |
| mrange.offset = 0; |
| mrange.size = VK_WHOLE_SIZE; // size; |
| VULKAN_CALL(vkInvalidateMappedMemoryRanges(device, 1, &mrange)); |
| } |
| memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>(staging_buffer.host_addr), size); |
| } else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) { |
| auto& device = this->device(dev_to.device_id); |
| auto& stream = device.ThreadLocalStream(); |
| const auto* to_buf = static_cast<const VulkanBuffer*>(to); |
| auto& staging_buffer = device.ThreadLocalStagingBuffer(size); |
| memcpy(staging_buffer.host_addr, static_cast<const char*>(from) + from_offset, size); |
| // host side flush if access is not coherent. |
| // so writes from CPU is visible to GPU |
| if (!device.coherent_staging) { |
| VkMappedMemoryRange mrange; |
| mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; |
| mrange.pNext = nullptr; |
| mrange.memory = staging_buffer.vk_buf.memory; |
| mrange.offset = 0; |
| mrange.size = VK_WHOLE_SIZE; // size; |
| VULKAN_CALL(vkFlushMappedMemoryRanges(device, 1, &mrange)); |
| } |
| |
| stream.Launch([&](VulkanStreamState* state) { |
| // 0: barrier(host->transfer) |
| VkMemoryBarrier barrier_info; |
| barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER; |
| barrier_info.pNext = nullptr; |
| barrier_info.srcAccessMask = 0; |
| barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; |
| vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT, |
| VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0, |
| nullptr); |
| // 1: copy |
| VkBufferCopy copy_info; |
| copy_info.srcOffset = 0; |
| copy_info.dstOffset = to_offset; |
| copy_info.size = size; |
| vkCmdCopyBuffer(state->cmd_buffer_, staging_buffer.vk_buf.buffer, to_buf->buffer, 1, |
| ©_info); |
| }); |
| |
| stream.ProfilerReady(); |
| // TODO(tulloch): should we instead make the staging buffer a property of the |
| // Stream? This would allow us to elide synchronizations here. |
| stream.Synchronize(); |
| } else { |
| TVM_FFI_THROW(InternalError) << "Expect copy from/to Vulkan or between Vulkan" |
| << ", from=" << from_dev_type << ", to=" << to_dev_type; |
| } |
| } |
| |
| const VulkanDevice& VulkanDeviceAPI::device(size_t device_id) const { |
| TVM_FFI_ICHECK_LT(device_id, devices_.size()) |
| << "Requested Vulkan device_id=" << device_id << ", but only " << devices_.size() |
| << " devices present"; |
| return devices_[device_id]; |
| } |
| |
| VulkanDevice& VulkanDeviceAPI::device(size_t device_id) { |
| return const_cast<VulkanDevice&>(const_cast<const VulkanDeviceAPI*>(this)->device(device_id)); |
| } |
| |
| TVM_FFI_STATIC_INIT_BLOCK() { |
| namespace refl = tvm::ffi::reflection; |
| refl::GlobalDef() |
| .def_packed("device_api.vulkan", |
| [](ffi::PackedArgs args, ffi::Any* rv) { |
| DeviceAPI* ptr = VulkanDeviceAPI::Global(); |
| *rv = static_cast<void*>(ptr); |
| }) |
| .def("device_api.vulkan.get_target_property", [](Device dev, const std::string& property) { |
| ffi::Any rv; |
| VulkanDeviceAPI::Global()->GetTargetProperty(dev, property, &rv); |
| return rv; |
| }); |
| } |
| |
| } // namespace vulkan |
| } // namespace runtime |
| } // namespace tvm |