/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file codegen_nvptx.cc
* \brief NVPTX code generator.
*/
#ifdef TVM_LLVM_VERSION
#include <llvm/ADT/SmallString.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/Function.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InlineAsm.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/IntrinsicsNVPTX.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Metadata.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/Alignment.h>
#include <llvm/Support/CodeGen.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Target/TargetMachine.h>
#include <tvm/ffi/reflection/registry.h>
#if TVM_LLVM_VERSION < 170
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#endif
#include <tvm/runtime/device_api.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "../../build_common.h"
#include "../../llvm/codegen_llvm.h"
#include "../../llvm/llvm_instance.h"
#include "../cuda_fallback_module.h"
namespace tvm {
namespace codegen {
// NVPTX code generator.
class CodeGenNVPTX : public CodeGenLLVM {
public:
llvm::Function* DeclareFunction(const GlobalVar& gvar, const PrimFunc& f) final {
    // declare the function with a void return value
return CodeGenLLVM::DeclareFunctionInternal(gvar, f);
}
void AddFunction(const GlobalVar& gvar, const PrimFunc& f) final {
// add function as void return value
CodeGenLLVM::AddFunctionInternal(gvar, f);
// annotate as kernel function
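    // NVVM identifies kernel entry points via !nvvm.annotations metadata:
    // the (function, "kernel", 1) triple below marks the function as a CUDA
    // kernel rather than an ordinary device function.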
llvm::LLVMContext* ctx = llvm_target_->GetContext();
module_->getOrInsertNamedMetadata("nvvm.annotations")
->addOperand(llvm::MDNode::get(
*ctx, {llvm::ValueAsMetadata::get(function_), llvm::MDString::get(*ctx, "kernel"),
llvm::ValueAsMetadata::get(ConstInt32(1))}));
}
void VisitStmt_(const AllocBufferNode* op) final {
llvm::Value* buf = nullptr;
StorageInfo& info = alloc_storage_info_[op->buffer->data.get()];
    // cap the alignment at 16 bytes, the maximum necessary alignment on NVIDIA devices
if (info.alignment > 16) {
info.alignment = 16;
}
auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(op->buffer->data));
DataType dtype = op->buffer->dtype;
if (storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn") {
      // Dynamic shared memory ("shared.dyn"): address space 3; the size is
      // supplied at kernel launch, so the static size here is zero.
buf = AllocateSharedMemory(dtype, 0, 3, info.alignment, llvm::GlobalValue::ExternalLinkage);
} else {
      // Compute constant_size from the buffer shape (flattened to 1-D by this point)
const IntImmNode* dim_imm = op->buffer->shape[0].as<IntImmNode>();
      TVM_FFI_ICHECK(dim_imm) << "Can only handle constant-size stack allocation on GPU";
      size_t constant_size = static_cast<size_t>(dim_imm->value);
      TVM_FFI_ICHECK_GT(constant_size, 0)
          << "Can only handle constant-size stack allocation on GPU";
if (constant_size % 4 == 0 && info.alignment == 0) {
info.alignment = GetTempAllocaAlignment(dtype, constant_size);
}
if (storage_scope.rank == runtime::StorageRank::kLocal) {
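        // Local scope: emit the alloca in the function entry block, the
        // canonical position that lets LLVM promote or fold it.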
llvm::AllocaInst* alloca = WithFunctionEntry([&]() {
return builder_->CreateAlloca(DTypeToLLVMType(dtype), ConstInt32(constant_size));
});
auto alignment = static_cast<unsigned>(alloca->getAlign().value());
if (alignment < static_cast<unsigned>(info.alignment)) {
alloca->setAlignment(llvm::Align(info.alignment));
}
buf = alloca;
} else {
TVM_FFI_ICHECK(storage_scope.rank == runtime::StorageRank::kShared)
<< "Can only allocate shared or local memory inside kernel";
buf = AllocateSharedMemory(dtype, constant_size, 3, info.alignment,
llvm::GlobalValue::ExternalLinkage);
}
}
buf = builder_->CreatePointerCast(
buf, llvmGetPointerTo(DTypeToLLVMType(dtype), buf->getType()->getPointerAddressSpace()));
TVM_FFI_ICHECK(!var_map_.count(op->buffer->data.get()));
var_map_[op->buffer->data.get()] = buf;
    if (op->annotations.count(tir::attr::kVolatile)) {
volatile_buf_.insert(op->buffer->data.get());
}
}
// Return the thread index via intrinsics.
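  // ts.rank == 1 maps to threadIdx.{x,y,z} (tid sregs);
  // ts.rank == 0 maps to blockIdx.{x,y,z} (ctaid sregs).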
llvm::Value* GetThreadIndex(const IterVar& iv) final {
runtime::ThreadScope ts = runtime::ThreadScope::Create(iv->thread_tag);
llvm::Intrinsic::ID intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x;
if (ts.rank == 1) {
switch (ts.dim_index) {
case 0:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x;
break;
case 1:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_tid_y;
break;
case 2:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_tid_z;
break;
default:
TVM_FFI_THROW(InternalError) << "unknown thread idx";
}
} else {
TVM_FFI_ICHECK_EQ(ts.rank, 0);
switch (ts.dim_index) {
case 0:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x;
break;
case 1:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_y;
break;
case 2:
intrin_id = llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_z;
break;
default:
TVM_FFI_THROW(InternalError) << "unknown thread idx";
}
}
#if TVM_LLVM_VERSION >= 200
llvm::Function* f = llvm::cast<llvm::Function>(
llvm::Intrinsic::getOrInsertDeclaration(module_.get(), intrin_id, {}));
#else
llvm::Function* f = llvm::Intrinsic::getDeclaration(module_.get(), intrin_id);
#endif
return builder_->CreateCall(f, {});
}
llvm::Value* CreateStorageSync(const CallNode* op) final {
const std::string& sync = op->args[0].as<StringImmNode>()->value;
if (sync == "warp") {
// TODO(tqchen) warp sync in CUDA9
return nullptr;
} else if (sync == "shared" || sync == "shared.dyn") {
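      // Equivalent to __syncthreads(): the NVVM barrier intrinsic lowers to
      // the PTX CTA-wide barrier (bar.sync 0 / barrier.cta.sync.aligned).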
#if TVM_LLVM_VERSION >= 200
llvm::Function* f = llvm::cast<llvm::Function>(llvm::Intrinsic::getOrInsertDeclaration(
#if TVM_LLVM_VERSION >= 210
module_.get(), llvm::Intrinsic::nvvm_barrier_cta_sync_aligned_all, {}));
#else
module_.get(), llvm::Intrinsic::nvvm_barrier0, {}));
#endif
#else
llvm::Function* f =
llvm::Intrinsic::getDeclaration(module_.get(), llvm::Intrinsic::nvvm_barrier0);
#endif
return builder_->CreateCall(f, {});
} else {
TVM_FFI_THROW(InternalError) << "Do not support sync " << sync;
}
}
#if TVM_LLVM_VERSION < 160
// This function only works with the legacy pass manager.
void InitPassManagerBuilder(llvm::PassManagerBuilder* builder) final {
// Additional optimization hook to tweak the builder.
}
#endif
void Optimize() final {
for (auto& f : *module_) {
auto fname = static_cast<std::string>(f.getName());
if (fname.substr(0, 4) != "__nv") continue;
// This is to strip off unused __nv_* functions from the final module
// The one that is actually used will be inlined at call site
// Adapted from Halide's runtime linker
if (!f.isDeclaration() && !f.hasFnAttribute(llvm::Attribute::NoInline)) {
f.setLinkage(llvm::GlobalValue::AvailableExternallyLinkage);
}
}
CodeGenLLVM::Optimize();
}
llvm::Value* CreateIntrinsic(const CallNode* op) override;
protected:
void InitTarget() final {
    // Maximum vector width is float4: 4 lanes x 32 bits = 128 bits.
native_vector_bits_ = 4 * 32;
CodeGenLLVM::InitTarget();
}
};
// Check if this is a warp shuffle intrinsic call and match its
// corresponding nvvm intrinsic. Return true if the match is successful.
static bool GetWarpShuffleIntrinsic(const CallNode* op, llvm::Intrinsic::ID* id) {
  // Only scalar 32-bit data types are supported.
if (op->dtype.is_fixed_length_vector() || op->dtype.bits() != 32) {
return false;
}
// Intrinsic lookup table.
  // It is difficult to emit the _sync version that works on Pascal.
// We ignore the mask and only emit the non-sync version for nvptx.
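  // Table layout: {idx, up, down} x {i32, f32}; the selected entry is
  // ids[offset + is_float], with offset 0/2/4 per shuffle kind.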
llvm::Intrinsic::ID ids[] = {
llvm::Intrinsic::nvvm_shfl_idx_i32, llvm::Intrinsic::nvvm_shfl_idx_f32,
llvm::Intrinsic::nvvm_shfl_up_i32, llvm::Intrinsic::nvvm_shfl_up_f32,
llvm::Intrinsic::nvvm_shfl_down_i32, llvm::Intrinsic::nvvm_shfl_down_f32};
int offset = 0;
if (op->op.same_as(builtin::tvm_warp_shuffle())) {
offset = 0;
} else if (op->op.same_as(builtin::tvm_warp_shuffle_up())) {
offset = 2;
} else if (op->op.same_as(builtin::tvm_warp_shuffle_down())) {
offset = 4;
} else {
return false;
}
*id = ids[offset + op->dtype.is_float()];
return true;
}
llvm::Value* CodeGenNVPTX::CreateIntrinsic(const CallNode* op) {
llvm::Intrinsic::ID id = llvm::Intrinsic::not_intrinsic;
if (GetWarpShuffleIntrinsic(op, &id)) {
std::vector<llvm::Value*> arg_value;
std::vector<llvm::Type*> arg_type;
    // Ignore the first mask operand and drop the last,
    // redundant warp_size argument.
size_t n_args = op->args.size() - 1;
for (size_t i = 1; i < n_args; ++i) {
arg_value.push_back(MakeValue(op->args[i]));
arg_type.push_back(arg_value.back()->getType());
}
llvm::Type* return_type = arg_type[0];
llvm::Function* func = GetIntrinsicDecl(id, return_type, arg_type);
return builder_->CreateCall(func, arg_value);
} else if (op->op.same_as(builtin::tvm_warp_activemask())) {
// Only nvptx target may keep this intrinsic at this point.
    // PTX inline assembly: "activemask.b32 %0"
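    // "=r" constrains the single 32-bit result to a register; the trailing
    // 'true' marks the inline asm as having side effects.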
auto fty = llvm::FunctionType::get(t_int32_, false);
auto val = llvm::InlineAsm::get(fty, "activemask.b32 %0", "=r", true);
return builder_->CreateCall(val);
} else if (op->op.same_as(builtin::atomic_add())) {
TVM_FFI_ICHECK(op->args[1]->dtype.bits() == 32) << "Only supports 32 bit atomic for now";
llvm::Value* v0 = MakeValue(op->args[0]);
llvm::Value* v1 = MakeValue(op->args[1]);
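    // Use monotonic (relaxed) ordering, matching the semantics of CUDA's
    // atomicAdd.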
if (op->args[1]->dtype.is_float()) {
return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, llvm::MaybeAlign(),
llvm::AtomicOrdering::Monotonic);
}
return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, llvm::MaybeAlign(),
llvm::AtomicOrdering::Monotonic);
}
return CodeGenLLVM::CreateIntrinsic(op);
}
int GetCUDAComputeVersion(const Target& target) {
ffi::Optional<ffi::String> mcpu = target->GetAttr<ffi::String>("mcpu");
TVM_FFI_CHECK(mcpu.has_value(), InternalError) << "\"-mcpu\" is undefined in the NVPTX target";
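  // mcpu has the form "sm_<arch>", e.g. "sm_70"; strip the "sm_" prefix and
  // parse the numeric compute capability.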
std::string sm_version = mcpu.value();
return std::stoi(sm_version.substr(3));
}
ffi::Module BuildNVPTX(IRModule mod, Target target) {
LLVMInstance llvm_instance;
With<LLVMTarget> llvm_target(llvm_instance, target);
int compute_ver = GetCUDAComputeVersion(target);
auto cg = std::make_unique<CodeGenNVPTX>();
cg->Init("TVMPTXModule", llvm_target.get(), std::nullopt, false, false);
cg->AddFunctionsOrdered(mod->functions.begin(), mod->functions.end());
llvm::TargetMachine* tm = llvm_target->GetOrCreateTargetMachine();
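  // If the libdevice-path callback is registered (typically from the Python
  // side), link in the libdevice bitcode for this compute capability so that
  // CUDA math functions (__nv_*) resolve.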
const auto flibdevice_path = tvm::ffi::Function::GetGlobal("tvm_callback_libdevice_path");
if (flibdevice_path.has_value()) {
std::string path = (*flibdevice_path)(compute_ver).cast<std::string>();
if (path.length() != 0) {
std::unique_ptr<llvm::Module> mlib = llvm_instance.LoadIR(path);
#if TVM_LLVM_VERSION >= 210
mlib->setTargetTriple(llvm::Triple(llvm_target->GetTargetTriple()));
#else
mlib->setTargetTriple(llvm_target->GetTargetTriple());
#endif
mlib->setDataLayout(tm->createDataLayout());
cg->AddLinkModule(std::move(mlib));
}
}
std::unique_ptr<llvm::Module> module = cg->Finish();
llvm::SmallString<8> data_ptx, data_ll;
llvm::raw_svector_ostream dest_ptx(data_ptx), dest_ll(data_ll);
dest_ptx.SetUnbuffered();
dest_ll.SetUnbuffered();
// print ll
module->print(dest_ll, nullptr);
std::string ll(data_ll.begin(), data_ll.end());
// emit ptx
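  // Assembly emission still goes through the legacy PassManager, even with
  // newer LLVM versions.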
llvm::legacy::PassManager pass;
#if TVM_LLVM_VERSION <= 170
  TVM_FFI_ICHECK(tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::CGFT_AssemblyFile) == 0)
      << "Cannot emit target CGFT_AssemblyFile";
#else
  TVM_FFI_ICHECK(
      tm->addPassesToEmitFile(pass, dest_ptx, nullptr, llvm::CodeGenFileType::AssemblyFile) == 0)
      << "Cannot emit target CodeGenFileType::AssemblyFile";
#endif
pass.run(*module);
std::string ptx(data_ptx.begin(), data_ptx.end());
  // BuildNVPTX produces PTX directly via the LLVM NVPTX backend; hand it to
// the fallback-aware factory. Source map is `{"ll": ll}` so InspectSource
// can recover the LLVM IR even when the receiver only has a fallback module.
ffi::Map<ffi::String, ffi::String> source_map;
source_map.Set("ll", ll);
return target::CUDAModuleCreateWithFallback(ffi::Bytes(ptx.data(), ptx.size()),
ffi::String("ptx"), ExtractFuncInfo(mod), source_map);
}
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
.def("target.build.nvptx", BuildNVPTX)
.def_packed("tvm.codegen.llvm.target_nvptx", [](const ffi::PackedArgs& targs, ffi::Any* rv) {
*rv = static_cast<void*>(new CodeGenNVPTX());
});
}
} // namespace codegen
} // namespace tvm
#endif // TVM_LLVM_VERSION