| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file codegen_amdgpu.cc |
| * \brief AMDGPU code generator. |
| */ |
| #ifdef TVM_LLVM_VERSION |
| |
| #include <llvm/ADT/SmallString.h> |
| #include <llvm/IR/Attributes.h> |
| #include <llvm/IR/CallingConv.h> |
| #include <llvm/IR/Function.h> |
| #include <llvm/IR/GlobalValue.h> |
| #include <llvm/IR/Instructions.h> |
| #include <llvm/IR/Intrinsics.h> |
| #include <llvm/IR/IntrinsicsAMDGPU.h> |
| #include <llvm/IR/LegacyPassManager.h> |
| #include <llvm/IRReader/IRReader.h> |
| #include <llvm/Support/Alignment.h> |
| #include <llvm/Support/CodeGen.h> |
| #include <llvm/Support/SourceMgr.h> |
| #include <llvm/Support/raw_ostream.h> |
| #include <llvm/Target/TargetMachine.h> |
| #include <tvm/ffi/reflection/registry.h> |
| #if TVM_LLVM_VERSION < 170 |
| #include <llvm/Transforms/IPO/PassManagerBuilder.h> |
| #endif |
| #include <llvm/IR/Module.h> |
| #include <llvm/Transforms/Utils/Cloning.h> |
| #include <tvm/ffi/function.h> |
| #include <tvm/runtime/base.h> |
| #include <tvm/runtime/device_api.h> |
| #include <tvm/runtime/logging.h> |
| |
| #include "../../../../runtime/metadata.h" |
| #include "../../../../target/build_common.h" |
| #include "../../../../target/llvm/codegen_llvm.h" |
| #include "../../../../target/llvm/llvm_instance.h" |
| #include "../rocm_fallback_module.h" |
| |
| namespace tvm { |
| namespace codegen { |
| |
| namespace { |
| |
| // calls the device api to get the max threads per block |
| static inline int DetectROCMmaxThreadsPerBlock() { |
| Device tvm_dev; |
| tvm_dev.device_type = kDLROCM; |
| tvm_dev.device_id = 0; |
| tvm::runtime::DeviceAPI* api = tvm::runtime::DeviceAPI::Get(tvm_dev, true); |
| if (api != nullptr) { |
| ffi::Any val; |
| api->GetAttr(tvm_dev, tvm::runtime::kExist, &val); |
| if (val.cast<int>() == 1) { |
| tvm::runtime::DeviceAPI::Get(tvm_dev)->GetAttr(tvm_dev, tvm::runtime::kMaxThreadsPerBlock, |
| &val); |
| return val.cast<int>(); |
| } |
| } |
| LOG(WARNING) << "Cannot get maximum number of threads for AMD codegen"; |
| return 256; // see the discussion at PR #4342 for the choice of default |
| } |
| |
| } // namespace |
| |
| // AMDGPU code generator. |
| class CodeGenAMDGPU : public CodeGenLLVM { |
| public: |
| CodeGenAMDGPU() = default; |
| virtual ~CodeGenAMDGPU() = default; |
| |
| void AddFunction(const GlobalVar& gvar, const PrimFunc& f) final { |
| // add function as void return value |
| CodeGenLLVM::AddFunctionInternal(gvar, f); |
| function_->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); |
| std::ostringstream attr; |
| attr << "1," << DetectROCMmaxThreadsPerBlock(); |
| function_->addFnAttr("amdgpu-flat-work-group-size", attr.str()); |
| } |
| |
| void VisitStmt_(const AllocBufferNode* op) final { |
| llvm::Value* buf = nullptr; |
| StorageInfo& info = alloc_storage_info_[op->buffer->data.get()]; |
| auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data)); |
| DataType dtype = op->buffer->dtype; |
| |
| if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") { |
| LOG(WARNING) << "Dynamic shared memory support for rocm is experimental."; |
| buf = AllocateSharedMemory(dtype, 0, 3, std::min(info.alignment, 16), |
| llvm::GlobalValue::ExternalLinkage); |
| } else { |
| const IntImmNode* dim_imm = op->buffer->shape[0].as<IntImmNode>(); |
| TVM_FFI_ICHECK(dim_imm) << "Can only handle constant size stack allocation in GPU"; |
| size_t constant_size = static_cast<size_t>(dim_imm->value); |
| TVM_FFI_ICHECK_GT(constant_size, 0) |
| << "Can only handle constant size stack allocation in GPU"; |
| |
| if (constant_size % 4 == 0 && info.alignment == 0) { |
| info.alignment = GetTempAllocaAlignment(dtype, constant_size); |
| } |
| // maximum necessary alignment in the AMD devices |
| if (info.alignment > 16) { |
| info.alignment = 16; |
| } |
| if (storage_scope.rank == runtime::StorageRank::kLocal) { |
| llvm::AllocaInst* alloca = WithFunctionEntry([&]() { |
| return builder_->CreateAlloca(DTypeToLLVMType(dtype), ConstInt32(constant_size)); |
| }); |
| auto alignment = static_cast<unsigned>(alloca->getAlign().value()); |
| if (alignment < static_cast<unsigned>(info.alignment)) { |
| alloca->setAlignment(llvm::Align(info.alignment)); |
| } |
| buf = alloca; |
| } else { |
| TVM_FFI_ICHECK(storage_scope.rank == runtime::StorageRank::kShared) |
| << "Can only allocate shared or local memory inside kernel"; |
| // Shared memory: address space == 3 |
| buf = AllocateSharedMemory(dtype, constant_size, 3, info.alignment, |
| llvm::GlobalValue::PrivateLinkage); |
| } |
| } |
| |
| buf = builder_->CreatePointerCast( |
| buf, llvmGetPointerTo(DTypeToLLVMType(dtype), buf->getType()->getPointerAddressSpace())); |
| TVM_FFI_ICHECK(!var_map_.count(op->buffer->data.get())); |
| var_map_[op->buffer->data.get()] = buf; |
| if (op->annotations.count(tirx::attr::kVolatile)) { |
| volatile_buf_.insert(op->buffer->data.get()); |
| } |
| } |
| |
| // Return the thread index via intrinsics. |
| llvm::Value* GetThreadIndex(const IterVar& iv) final { |
| runtime::ThreadScope ts = runtime::ThreadScope::Create(iv->thread_tag); |
| llvm::Intrinsic::ID intrin_id = llvm::Intrinsic::amdgcn_workitem_id_x; |
| if (ts.rank == 1) { |
| switch (ts.dim_index) { |
| case 0: |
| intrin_id = llvm::Intrinsic::amdgcn_workitem_id_x; |
| break; |
| case 1: |
| intrin_id = llvm::Intrinsic::amdgcn_workitem_id_y; |
| break; |
| case 2: |
| intrin_id = llvm::Intrinsic::amdgcn_workitem_id_z; |
| break; |
| default: |
| TVM_FFI_THROW(InternalError) << "unknown workitem idx"; |
| } |
| } else { |
| TVM_FFI_ICHECK_EQ(ts.rank, 0); |
| switch (ts.dim_index) { |
| case 0: |
| intrin_id = llvm::Intrinsic::amdgcn_workgroup_id_x; |
| break; |
| case 1: |
| intrin_id = llvm::Intrinsic::amdgcn_workgroup_id_y; |
| break; |
| case 2: |
| intrin_id = llvm::Intrinsic::amdgcn_workgroup_id_z; |
| break; |
| default: |
| TVM_FFI_THROW(InternalError) << "unknown workgroup idx"; |
| } |
| } |
| #if TVM_LLVM_VERSION >= 200 |
| llvm::Function* f = llvm::cast<llvm::Function>( |
| llvm::Intrinsic::getOrInsertDeclaration(module_.get(), intrin_id, {})); |
| #else |
| llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id); |
| #endif |
| llvm::Value* result = builder_->CreateCall(f, {}); |
| return this->CreateCast(DataType::Int(32), iv->var->dtype, result); |
| } |
| |
| llvm::Value* CreateStorageSync(const CallNode* op) final { |
| const std::string& sync = op->args[0].as<StringImmNode>()->value; |
| if (sync == "warp") { |
| return nullptr; |
| } else if (sync == "shared") { |
| #if TVM_LLVM_VERSION >= 200 |
| llvm::Function* f = llvm::cast<llvm::Function>(llvm::Intrinsic::getOrInsertDeclaration( |
| module_.get(), llvm::Intrinsic::amdgcn_s_barrier, {})); |
| #else |
| llvm::Function* f = |
| llvm::Intrinsic::getDeclaration(module_.get(), llvm::Intrinsic::amdgcn_s_barrier); |
| #endif |
| return builder_->CreateCall(f, {}); |
| } else { |
| TVM_FFI_THROW(InternalError) << "Do not support sync " << sync; |
| } |
| } |
| |
| #if TVM_LLVM_VERSION < 160 |
| // This function only works with the legacy pass manager. |
| void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final { |
| // Additional optimization hook to tweak the builder. |
| } |
| #endif |
| |
| unsigned GetGlobalAddressSpace() const final { return 1; } |
| |
| llvm::Value* CreateIntrinsic(const CallNode* op) final { |
| if (op->op.same_as(builtin::atomic_add())) { |
| TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now"; |
| llvm::Value* v0 = MakeValue(op->args[0]); |
| llvm::Value* v1 = MakeValue(op->args[1]); |
| if (op->args[1]->dtype.is_float()) { |
| return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(), |
| llvm::AtomicOrdering::Monotonic); |
| } |
| return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::MaybeAlign(), |
| llvm::AtomicOrdering::Monotonic); |
| } |
| return CodeGenLLVM::CreateIntrinsic(op); |
| } |
| |
| protected: |
| void InitTarget() final { |
| // Maximum vector lane = float4 |
| native_vector_bits_ = 4 * 32; |
| CodeGenLLVM::InitTarget(); |
| } |
| }; |
| |
| ffi::Module BuildAMDGPU(IRModule mod, Target target) { |
| LLVMInstance llvm_instance; |
| |
| With<LLVMTarget> llvm_target(llvm_instance, target); |
| auto cg = std::make_unique<CodeGenAMDGPU>(); |
| |
| cg->Init("TVMAMDGPUModule", llvm_target.get(), std::nullopt, false, false); |
| |
| cg->AddFunctionsOrdered(mod->functions.begin(), mod->functions.end()); |
| |
| llvm::TargetMachine* tm = llvm_target->GetOrCreateTargetMachine(); |
| auto fbitcode = tvm::ffi::Function::GetGlobalRequired("tvm_callback_rocm_bitcode_path"); |
| auto bitcode_files = fbitcode().cast<ffi::Array<ffi::String>>(); |
| |
| for (auto& bitcode_path : bitcode_files) { |
| std::unique_ptr<llvm::Module> mlib = llvm_instance.LoadIR(bitcode_path); |
| #if TVM_LLVM_VERSION >= 210 |
| mlib->setTargetTriple(llvm::Triple(llvm_target->GetTargetTriple())); |
| #else |
| mlib->setTargetTriple(llvm_target->GetTargetTriple()); |
| #endif |
| mlib->setDataLayout(tm->createDataLayout()); |
| |
| for (llvm::Function& f : mlib->functions()) { |
| f.addFnAttr(llvm::Attribute::AlwaysInline); |
| } |
| cg->AddLinkModule(std::move(mlib)); |
| } |
| |
| std::unique_ptr<llvm::Module> module = cg->Finish(); |
| llvm::SmallString<8> dataObj, data_ll, dataAsm; |
| llvm::raw_svector_ostream destObj(dataObj), dest_ll(data_ll), destAsm(dataAsm); |
| destObj.SetUnbuffered(); |
| dest_ll.SetUnbuffered(); |
| destAsm.SetUnbuffered(); |
| module->print(dest_ll, nullptr); |
| std::unique_ptr<llvm::Module> mAsm = llvm::CloneModule(*module.get()); |
| std::unique_ptr<llvm::Module> mObj = llvm::CloneModule(*module.get()); |
| llvm::legacy::PassManager pass; |
| |
| #if TVM_LLVM_VERSION <= 170 |
| TVM_FFI_ICHECK(tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::CGFT_ObjectFile) == 0) |
| << "Cannot emit target CGFT_ObjectFile"; |
| #else |
| TVM_FFI_ICHECK( |
| tm->addPassesToEmitFile(pass, destObj, nullptr, llvm::CodeGenFileType::ObjectFile) == 0) |
| << "Cannot emit target CodeGenFileType::ObjectFile"; |
| #endif |
| pass.run(*mObj); |
| std::string obj(dataObj.begin(), dataObj.end()); |
| |
| llvm::legacy::PassManager passAsm; |
| #if TVM_LLVM_VERSION <= 170 |
| TVM_FFI_ICHECK(tm->addPassesToEmitFile(passAsm, destAsm, nullptr, llvm::CGFT_AssemblyFile) == 0) |
| << "Cannot emit target CGFT_AssemblyFile"; |
| #else |
| TVM_FFI_ICHECK( |
| tm->addPassesToEmitFile(passAsm, destAsm, nullptr, llvm::CodeGenFileType::AssemblyFile) == 0) |
| << "Cannot emit target CGFT_AssemblyFile"; |
| #endif |
| passAsm.run(*mAsm); |
| std::string assembly(dataAsm.begin(), dataAsm.end()); |
| |
| auto flink = tvm::ffi::Function::GetGlobal("tvm_callback_rocm_link"); |
| TVM_FFI_ICHECK(flink.has_value()) |
| << "Require tvm_callback_rocm_link to exist, do import tvm.support.rocm"; |
| |
| TVMFFIByteArray arr; |
| arr.data = &obj[0]; |
| arr.size = obj.length(); |
| |
| std::string hsaco = (*flink)(&arr).cast<std::string>(); |
| std::string ll(data_ll.begin(), data_ll.end()); |
| ffi::Map<ffi::String, ffi::String> source; |
| source.Set("hip", ffi::String(ll)); |
| source.Set("asm", ffi::String(assembly)); |
| return target::ROCmModuleCreateWithFallback(ffi::Bytes(std::move(hsaco)), "hsaco", |
| ExtractFuncInfo(mod), std::move(source)); |
| } |
| |
| void RegisterAMDGPUCodegen() { |
| static bool registered = false; |
| if (registered) return; |
| registered = true; |
| |
| namespace refl = tvm::ffi::reflection; |
| refl::GlobalDef() |
| .def("target.build.rocm", BuildAMDGPU) |
| .def_packed("tvm.codegen.llvm.target_rocm", [](const ffi::PackedArgs& targs, ffi::Any* rv) { |
| *rv = static_cast<void*>(new CodeGenAMDGPU()); |
| }); |
| } |
| |
| } // namespace codegen |
| } // namespace tvm |
| |
| #endif // TVM_LLVM_VERSION |