tests/python/codegen/test_inject_ptx_ldg32.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import tvm
 from tvm.script import tir as T
 import numpy as np
 import tvm.testing


 @T.prim_func
 def vector_add(A: T.Buffer((16), "float32"), B: T.Buffer((32), "float32")) -> None:
     T.func_attr({"global_symbol": "default_function", "tir.noalias": True})
     bx = T.env_thread("blockIdx.x")
     tx = T.env_thread("threadIdx.x")
     T.launch_thread(bx, 1)
     T.launch_thread(tx, 32)
     with T.block():
         A_local = T.alloc_buffer((32), "float32", scope="local")

         with T.block():
             T.reads(A[0:16])
             T.writes(A_local[0:32])
             A_local[tx] = T.if_then_else(tx % 2 == 0, A[tx // 2], T.float32(0), dtype="float32")
             B[tx] = A_local[tx] + 1.0


 @tvm.testing.requires_cuda
 def test_inject_ptx_intrin():
     f = vector_add
     arch = tvm.contrib.nvcc.get_target_compute_version()
     major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
     if major < 8:
         # Require at least SM80
         return
     with tvm.transform.PassContext(config={"tir.ptx_ldg32": True}):
         mod = tvm.compile(f, target="cuda")
     A_np = np.random.rand(16).astype("float32")
     B_np = np.zeros((32)).astype("float32")
     dev = tvm.cuda(0)
     A_nd = tvm.runtime.tensor(A_np, device=dev)
     B_nd = tvm.runtime.tensor(B_np, device=dev)
     mod(A_nd, B_nd)

     C_np = np.zeros((32)).astype("float32")

     for i in range(32):
         if i % 2 == 0:
             C_np[i] = A_np[i // 2]
         C_np[i] += 1.0

     tvm.testing.assert_allclose(B_nd.numpy(), C_np)


 if __name__ == "__main__":
     test_inject_ptx_intrin()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import tvm
	from tvm.script import tir as T
	import numpy as np
	import tvm.testing


	@T.prim_func
	def vector_add(A: T.Buffer((16), "float32"), B: T.Buffer((32), "float32")) -> None:
	T.func_attr({"global_symbol": "default_function", "tir.noalias": True})
	bx = T.env_thread("blockIdx.x")
	tx = T.env_thread("threadIdx.x")
	T.launch_thread(bx, 1)
	T.launch_thread(tx, 32)
	with T.block():
	A_local = T.alloc_buffer((32), "float32", scope="local")

	with T.block():
	T.reads(A[0:16])
	T.writes(A_local[0:32])
	A_local[tx] = T.if_then_else(tx % 2 == 0, A[tx // 2], T.float32(0), dtype="float32")
	B[tx] = A_local[tx] + 1.0


	@tvm.testing.requires_cuda
	def test_inject_ptx_intrin():
	f = vector_add
	arch = tvm.contrib.nvcc.get_target_compute_version()
	major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
	if major < 8:
	# Require at least SM80
	return
	with tvm.transform.PassContext(config={"tir.ptx_ldg32": True}):
	mod = tvm.compile(f, target="cuda")
	A_np = np.random.rand(16).astype("float32")
	B_np = np.zeros((32)).astype("float32")
	dev = tvm.cuda(0)
	A_nd = tvm.runtime.tensor(A_np, device=dev)
	B_nd = tvm.runtime.tensor(B_np, device=dev)
	mod(A_nd, B_nd)

	C_np = np.zeros((32)).astype("float32")

	for i in range(32):
	if i % 2 == 0:
	C_np[i] = A_np[i // 2]
	C_np[i] += 1.0

	tvm.testing.assert_allclose(B_nd.numpy(), C_np)


	if __name__ == "__main__":
	test_inject_ptx_intrin()