blob: 3dc5f146dd6ae2bb1b36105907a18bd2a3856bce [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include "vulkan_device_api.h"
#include <tvm/ffi/reflection/registry.h>
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include "vulkan_common.h"
namespace tvm {
namespace runtime {
namespace vulkan {
// Return the process-wide VulkanDeviceAPI instance.
VulkanDeviceAPI* VulkanDeviceAPI::Global() {
  // Most TVM Global() accessors allocate with "new" and never free,
  // leaving cleanup of leftover buffers to the OS. Here a function-local
  // static is used instead, because the VulkanDeviceAPI destructor is
  // needed to call vkDestroyInstance; skipping it causes a segfault on
  // exit with some nvidia drivers.
  static VulkanDeviceAPI singleton;
  return &singleton;
}
// Build the device list: wrap every physical device reported by
// instance_, keep the ones that support compute, and order them by
// device-type preference.
VulkanDeviceAPI::VulkanDeviceAPI() {
  const std::vector<VkPhysicalDevice> physical_devices = instance_.GetPhysicalDevices();
  for (const VkPhysicalDevice& handle : physical_devices) {
    VulkanDevice candidate(instance_, handle);
    if (!candidate.SupportsCompute()) {
      continue;
    }
    devices_.push_back(std::move(candidate));
  }

  // Rank of a device by its type string: the position of the type in
  // the preference table, or one past the end for any other type.
  auto type_rank = [](const VulkanDevice& dev) -> int {
    static constexpr const char* kPreferredOrder[] = {"discrete", "integrated", "virtual", "cpu"};
    const std::string& type = dev.device_properties.device_type;
    int rank = 0;
    for (const char* name : kPreferredOrder) {
      if (type == name) {
        break;
      }
      ++rank;
    }
    return rank;
  };

  // Discrete GPUs go first, so the default device_id=0 preferentially
  // refers to a discrete GPU. The sort is stable, so devices of the
  // same type keep their enumeration order.
  std::stable_sort(devices_.begin(), devices_.end(),
                   [&type_rank](const VulkanDevice& lhs, const VulkanDevice& rhs) {
                     return type_rank(lhs) < type_rank(rhs);
                   });
}
// No explicit teardown is performed here; members are destroyed by
// their own destructors.
VulkanDeviceAPI::~VulkanDeviceAPI() = default;
// Record dev.device_id as the active Vulkan device id in
// active_device_id_per_thread.
//
// dev must have device_type == kDLVulkan and a device_id that indexes
// an entry of devices_, i.e. 0 <= device_id < devices_.size(). Any
// violation fails an internal check.
void VulkanDeviceAPI::SetDevice(Device dev) {
  TVM_FFI_ICHECK_EQ(dev.device_type, kDLVulkan)
      << "Active vulkan device cannot be set to non-vulkan device" << dev;
  // Validate the full index range here. Accepting device_id == size (or
  // a negative id) would store an unusable id that only fails later, in
  // the bounds check of device().
  TVM_FFI_ICHECK_LE(0, dev.device_id)
      << "Attempted to set active vulkan device to negative device_id==" << dev.device_id;
  TVM_FFI_ICHECK_LT(dev.device_id, static_cast<int>(devices_.size()))
      << "Attempted to set active vulkan device to device_id==" << dev.device_id << ", but only "
      << devices_.size() << " devices present";
  active_device_id_per_thread.GetOrMake(0) = dev.device_id;
}
// Return the active device id stored in active_device_id_per_thread.
// GetOrMake(0) is used for the lookup, so 0 is the argument supplied
// when no entry exists yet.
int VulkanDeviceAPI::GetActiveDeviceID() {
  const int active_id = active_device_id_per_thread.GetOrMake(0);
  return active_id;
}
// Return the VulkanDevice whose index is the currently active device id.
VulkanDevice& VulkanDeviceAPI::GetActiveDevice() {
  const int active_id = GetActiveDeviceID();
  return device(active_id);
}
// Query one attribute of the device identified by dev.device_id and
// store the answer in *rv.
//
// kExist is answered without touching the device list entry. Every
// other kind first looks the device up, which fails an internal check
// for an id that is not present. Attribute kinds that this backend does
// not report leave *rv unmodified.
void VulkanDeviceAPI::GetAttr(Device dev, DeviceAttrKind kind, ffi::Any* rv) {
  const size_t index = static_cast<size_t>(dev.device_id);

  // Existence must be decided before calling device(), because device()
  // rejects an out-of-range index instead of returning.
  if (kind == kExist) {
    *rv = static_cast<int>(index < devices_.size());
    return;
  }

  const auto& prop = device(index).device_properties;

  // Render a packed Vulkan version number as "major.minor.patch".
  auto version_string = [](int64_t packed) {
    std::ostringstream os;
    os << VK_VERSION_MAJOR(packed) << "." << VK_VERSION_MINOR(packed) << "."
       << VK_VERSION_PATCH(packed);
    return os.str();
  };

  switch (kind) {
    case kMaxThreadsPerBlock:
      *rv = int64_t(prop.max_num_threads);
      break;

    case kMaxSharedMemoryPerBlock:
      *rv = int64_t(prop.max_shared_memory_per_block);
      break;

    case kWarpSize:
      *rv = int64_t(prop.thread_warp_size);
      break;

    case kComputeVersion:
      *rv = version_string(prop.vulkan_api_version);
      break;

    case kDriverVersion:
      *rv = version_string(prop.driver_version);
      break;

    case kDeviceName:
      *rv = std::string(prop.device_name);
      break;

    case kMaxThreadDimensions: {
      // The three block-size limits are returned together as a JSON
      // list encoded in a string.
      std::stringstream ss;
      ss << "[" << prop.max_block_size_x << ", " << prop.max_block_size_y << ", "
         << prop.max_block_size_z << "]";
      *rv = ss.str();
      break;
    }

    case kApiVersion:
      *rv = VK_HEADER_VERSION;
      break;

    case kTotalGlobalMemory:
      *rv = device(index).compute_memory_size;
      break;

    // Attributes with no value reported here; *rv is left as passed in.
    // kExist never reaches this point because of the early return above.
    // kAvailableGlobalMemory is not currently implemented, and will only
    // be implementable for devices that support the VK_EXT_memory_budget
    // extension.
    case kExist:
    case kMaxClockRate:
    case kMultiProcessorCount:
    case kMaxRegistersPerBlock:
    case kGcnArch:
    case kL2CacheSizeBytes:
    case kAvailableGlobalMemory:
    case kImagePitchAlignment:
      break;
  }
}
// Look up a named target property of the device identified by
// dev.device_id and store it in *rv.
//
// The property name must match one of the strings below exactly. When
// it matches none of them, *rv is left unmodified. Integer-valued
// properties are stored as int64_t; the remaining ones are stored with
// the type of the corresponding device_properties field.
void VulkanDeviceAPI::GetTargetProperty(Device dev, const std::string& property, ffi::Any* rv) {
  const size_t index = static_cast<size_t>(dev.device_id);
  const auto& prop = device(index).device_properties;

  // The names are mutually exclusive, so a single chain selects at most
  // one branch.
  if (property == "supports_float16") {
    *rv = prop.supports_float16;
  } else if (property == "supports_float32") {
    *rv = prop.supports_float32;
  } else if (property == "supports_float64") {
    *rv = prop.supports_float64;
  } else if (property == "supports_int8") {
    *rv = prop.supports_int8;
  } else if (property == "supports_int16") {
    *rv = prop.supports_int16;
  } else if (property == "supports_int32") {
    *rv = prop.supports_int32;
  } else if (property == "supports_int64") {
    *rv = prop.supports_int64;
  } else if (property == "supports_8bit_buffer") {
    *rv = prop.supports_8bit_buffer;
  } else if (property == "supports_16bit_buffer") {
    *rv = prop.supports_16bit_buffer;
  } else if (property == "supports_storage_buffer_storage_class") {
    *rv = prop.supports_storage_buffer_storage_class;
  } else if (property == "supports_push_descriptor") {
    *rv = prop.supports_push_descriptor;
  } else if (property == "supports_dedicated_allocation") {
    *rv = prop.supports_dedicated_allocation;
  } else if (property == "supported_subgroup_operations") {
    *rv = int64_t(prop.supported_subgroup_operations);
  } else if (property == "max_num_threads") {
    *rv = int64_t(prop.max_num_threads);
  } else if (property == "thread_warp_size") {
    *rv = int64_t(prop.thread_warp_size);
  } else if (property == "max_block_size_x") {
    *rv = int64_t(prop.max_block_size_x);
  } else if (property == "max_block_size_y") {
    *rv = int64_t(prop.max_block_size_y);
  } else if (property == "max_block_size_z") {
    *rv = int64_t(prop.max_block_size_z);
  } else if (property == "max_push_constants_size") {
    *rv = int64_t(prop.max_push_constants_size);
  } else if (property == "max_uniform_buffer_range") {
    *rv = int64_t(prop.max_uniform_buffer_range);
  } else if (property == "max_storage_buffer_range") {
    *rv = int64_t(prop.max_storage_buffer_range);
  } else if (property == "max_per_stage_descriptor_storage_buffer") {
    *rv = int64_t(prop.max_per_stage_descriptor_storage_buffer);
  } else if (property == "max_shared_memory_per_block") {
    *rv = int64_t(prop.max_shared_memory_per_block);
  } else if (property == "supports_integer_dot_product") {
    *rv = prop.supports_integer_dot_product;
  } else if (property == "supports_cooperative_matrix") {
    *rv = prop.supports_cooperative_matrix;
  } else if (property == "device_name") {
    *rv = prop.device_name;
  } else if (property == "device_type") {
    *rv = prop.device_type;
  } else if (property == "driver_name") {
    *rv = prop.driver_name;
  } else if (property == "driver_version") {
    *rv = int64_t(prop.driver_version);
  } else if (property == "vulkan_api_version") {
    *rv = int64_t(prop.vulkan_api_version);
  } else if (property == "max_spirv_version") {
    *rv = int64_t(prop.max_spirv_version);
  }
}
// Allocate a VulkanBuffer of nbytes on the device identified by
// dev.device_id and return it as an opaque pointer. The buffer is
// created as a storage buffer usable as a transfer source and
// destination. alignment and type_hint are not used.
//
// The returned pointer is a heap-allocated VulkanBuffer; FreeDataSpace
// deletes it.
void* VulkanDeviceAPI::AllocDataSpace(Device dev, size_t nbytes, size_t alignment,
                                      DLDataType type_hint) {
  // Vulkan seems to have issues if we return nullptr on a zero-size
  // allocation, so such requests are rounded up to one byte.
  const size_t alloc_bytes = (nbytes == 0) ? 1 : nbytes;

  const auto& target = this->device(dev.device_id);
  auto usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
               VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
  return new VulkanBuffer(target, alloc_bytes, usage, target.compute_mtype_index);
}
// Destroy a buffer previously returned by AllocDataSpace. ptr is
// treated as a VulkanBuffer*.
void VulkanDeviceAPI::FreeDataSpace(Device dev, void* ptr) {
  // Synchronize first, so that every Vulkan command referencing the
  // buffer has finished before the vkBuffer is released.
  StreamSync(dev, nullptr);
  delete static_cast<VulkanBuffer*>(ptr);
}
// Allocate size bytes of workspace on dev from the pool held in
// pool_per_thread, creating that pool on first use. type_hint is not
// used.
void* VulkanDeviceAPI::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) {
  return pool_per_thread.GetOrMake(kDLVulkan, this).AllocWorkspace(dev, size);
}
// Return a workspace obtained from AllocWorkspace to the pool held in
// pool_per_thread. Fails an internal check when no pool exists for the
// caller, i.e. when nothing was ever allocated through it.
void VulkanDeviceAPI::FreeWorkspace(Device dev, void* data) {
  // Get() does not create a pool, unlike the GetOrMake() call used on
  // the allocation path, so the result has to be checked.
  auto* pool = pool_per_thread.Get();
  TVM_FFI_ICHECK(pool) << "Attempted to free a vulkan workspace on a CPU-thread "
                       << "that has never allocated a workspace";
  pool->FreeWorkspace(dev, data);
}
// No stream object is created: the handle returned is always nullptr,
// which is also the only handle the other stream functions accept.
TVMStreamHandle VulkanDeviceAPI::CreateStream(Device dev) {
  return nullptr;
}
// Nothing to release: CreateStream only ever hands out nullptr, so any
// other handle fails the internal check.
void VulkanDeviceAPI::FreeStream(Device dev, TVMStreamHandle stream) {
  TVM_FFI_ICHECK_EQ(stream, static_cast<void*>(nullptr));
}
// There is only one stream, so synchronizing one stream with another is
// a no-op. Both handles are still required to be nullptr.
void VulkanDeviceAPI::SyncStreamFromTo(Device dev, TVMStreamHandle event_src,
                                       TVMStreamHandle event_dst) {
  TVM_FFI_ICHECK_EQ(event_src, static_cast<void*>(nullptr));
  TVM_FFI_ICHECK_EQ(event_dst, static_cast<void*>(nullptr));
}
// Synchronize the thread-local stream of the device identified by
// dev.device_id. stream must be nullptr.
void VulkanDeviceAPI::StreamSync(Device dev, TVMStreamHandle stream) {
  TVM_FFI_ICHECK_EQ(stream, static_cast<void*>(nullptr));
  auto& thread_stream = device(dev.device_id).ThreadLocalStream();
  thread_stream.Synchronize();
}
// Copy `size` bytes from `from` (starting at from_offset) to `to`
// (starting at to_offset).
//
// The direction is selected by the device types of dev_from and dev_to:
//  - Vulkan -> Vulkan: `from` and `to` are VulkanBuffer pointers. A
//    buffer copy followed by a memory barrier is launched on the
//    device's thread-local stream. Both ends must have the same
//    device_id. This branch does not call Synchronize().
//  - Vulkan -> CPU: `from` is a VulkanBuffer pointer, `to` is host
//    memory. The data goes through the thread-local staging buffer and
//    the stream is synchronized before the host-side memcpy.
//  - CPU -> Vulkan: `from` is host memory, `to` is a VulkanBuffer
//    pointer. The data is memcpy'd into the staging buffer, then a
//    barrier and a buffer copy are launched and the stream is
//    synchronized.
// Any other combination throws InternalError.
//
// `stream` must be nullptr. `type_hint` is not used.
void VulkanDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to,
size_t to_offset, size_t size, Device dev_from, Device dev_to,
DLDataType type_hint, TVMStreamHandle stream) {
TVM_FFI_ICHECK(stream == nullptr);
// NOTE(review): `dev` is assigned here but is not read anywhere else in
// this function; each branch below uses dev_from / dev_to directly.
Device dev = dev_from;
if (dev_from.device_type == kDLCPU) {
dev = dev_to;
}
int from_dev_type = static_cast<int>(dev_from.device_type);
int to_dev_type = static_cast<int>(dev_to.device_type);
if (from_dev_type == kDLVulkan && to_dev_type == kDLVulkan) {
TVM_FFI_ICHECK_EQ(dev_from.device_id, dev_to.device_id)
<< "The Vulkan runtime does not support deviceA to deviceB copies. "
<< "This should be changed to a deviceA to CPU copy, followed by a CPU to deviceB copy";
// The lambda captures by value ([=]), so it holds its own copies of
// the pointers, offsets and size.
// NOTE(review): presumably because the launched work may run after
// this function returns -- confirm against the stream's Launch().
device(dev_from.device_id).ThreadLocalStream().Launch([=](VulkanStreamState* state) {
// 1: copy
const auto* from_buf = static_cast<const VulkanBuffer*>(from);
auto* to_buf = static_cast<VulkanBuffer*>(to);
VkBufferCopy copy_info;
copy_info.srcOffset = from_offset;
copy_info.dstOffset = to_offset;
copy_info.size = size;
vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, to_buf->buffer, 1, &copy_info);
// 2: barrier(transfer-> compute|transfer)
// Orders the transfer write above before later transfer and
// compute-shader reads/writes.
VkMemoryBarrier barrier_info;
barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
barrier_info.pNext = nullptr;
barrier_info.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier_info.dstAccessMask = (VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT |
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
1, &barrier_info, 0, nullptr, 0, nullptr);
});
} else if (from_dev_type == kDLVulkan && to_dev_type == kDLCPU) {
const auto* from_buf = static_cast<const VulkanBuffer*>(from);
auto& device = this->device(dev_from.device_id);
// This local shadows the (nullptr) `stream` parameter for the rest of
// the branch.
auto& stream = device.ThreadLocalStream();
auto& staging_buffer = device.ThreadLocalStagingBuffer(size);
// Device buffer -> start of the staging buffer. Capturing by
// reference is used here; the stream is synchronized below, before
// this function returns.
stream.Launch([&](VulkanStreamState* state) {
VkBufferCopy copy_info;
copy_info.srcOffset = from_offset;
copy_info.dstOffset = 0;
copy_info.size = size;
vkCmdCopyBuffer(state->cmd_buffer_, from_buf->buffer, staging_buffer.vk_buf.buffer, 1,
&copy_info);
});
stream.Synchronize();
stream.ProfilerReset();
// Host side invalidate if access is not coherent, before the CPU
// reads the staging memory.
if (!device.coherent_staging) {
VkMappedMemoryRange mrange;
mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
mrange.pNext = nullptr;
mrange.memory = staging_buffer.vk_buf.memory;
mrange.offset = 0;
// The range covers the whole allocation rather than just `size` bytes.
mrange.size = VK_WHOLE_SIZE; // size;
// NOTE(review): `device` is a VulkanDevice passed where a VkDevice is
// expected -- presumably via a conversion declared elsewhere.
VULKAN_CALL(vkInvalidateMappedMemoryRanges(device, 1, &mrange));
}
// Staging buffer (from its start) -> destination host memory.
memcpy(static_cast<char*>(to) + to_offset, static_cast<char*>(staging_buffer.host_addr), size);
} else if (from_dev_type == kDLCPU && to_dev_type == kDLVulkan) {
auto& device = this->device(dev_to.device_id);
// This local shadows the (nullptr) `stream` parameter for the rest of
// the branch.
auto& stream = device.ThreadLocalStream();
const auto* to_buf = static_cast<const VulkanBuffer*>(to);
auto& staging_buffer = device.ThreadLocalStagingBuffer(size);
// Source host memory -> start of the staging buffer.
memcpy(staging_buffer.host_addr, static_cast<const char*>(from) + from_offset, size);
// host side flush if access is not coherent.
// so writes from CPU is visible to GPU
if (!device.coherent_staging) {
VkMappedMemoryRange mrange;
mrange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
mrange.pNext = nullptr;
mrange.memory = staging_buffer.vk_buf.memory;
mrange.offset = 0;
// The range covers the whole allocation rather than just `size` bytes.
mrange.size = VK_WHOLE_SIZE; // size;
VULKAN_CALL(vkFlushMappedMemoryRanges(device, 1, &mrange));
}
// Capturing by reference is used here; the stream is synchronized
// below, before this function returns.
stream.Launch([&](VulkanStreamState* state) {
// 0: barrier(host->transfer)
VkMemoryBarrier barrier_info;
barrier_info.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
barrier_info.pNext = nullptr;
barrier_info.srcAccessMask = 0;
barrier_info.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
vkCmdPipelineBarrier(state->cmd_buffer_, VK_PIPELINE_STAGE_HOST_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 1, &barrier_info, 0, nullptr, 0,
nullptr);
// 1: copy
// Start of the staging buffer -> device buffer at to_offset.
VkBufferCopy copy_info;
copy_info.srcOffset = 0;
copy_info.dstOffset = to_offset;
copy_info.size = size;
vkCmdCopyBuffer(state->cmd_buffer_, staging_buffer.vk_buf.buffer, to_buf->buffer, 1,
&copy_info);
});
stream.ProfilerReady();
// TODO(tulloch): should we instead make the staging buffer a property of the
// Stream? This would allow us to elide synchronizations here.
stream.Synchronize();
} else {
TVM_FFI_THROW(InternalError) << "Expect copy from/to Vulkan or between Vulkan"
<< ", from=" << from_dev_type << ", to=" << to_dev_type;
}
}
// Bounds-checked read-only access to the device at index device_id.
// Fails an internal check when device_id is not below devices_.size().
const VulkanDevice& VulkanDeviceAPI::device(size_t device_id) const {
  TVM_FFI_ICHECK_LT(device_id, devices_.size())
      << "Requested Vulkan device_id=" << device_id << ", but only " << devices_.size()
      << " devices present";
  const VulkanDevice& selected = devices_[device_id];
  return selected;
}
// Mutable counterpart of the const overload. It forwards to that
// overload so the bounds check lives in one place, then removes the
// const qualifier from the result.
VulkanDevice& VulkanDeviceAPI::device(size_t device_id) {
  const VulkanDeviceAPI* self = this;
  return const_cast<VulkanDevice&>(self->device(device_id));
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def_packed("device_api.vulkan",
[](ffi::PackedArgs args, ffi::Any* rv) {
DeviceAPI* ptr = VulkanDeviceAPI::Global();
*rv = static_cast<void*>(ptr);
})
.def("device_api.vulkan.get_target_property", [](Device dev, const std::string& property) {
ffi::Any rv;
VulkanDeviceAPI::Global()->GetTargetProperty(dev, property, &rv);
return rv;
});
}
} // namespace vulkan
} // namespace runtime
} // namespace tvm