tests/python/relax/test_tir_call_source_kernel.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import numpy as np

 import tvm
 import tvm.testing
 from tvm import relax
 from tvm.script import ir as I
 from tvm.script import relax as R
 from tvm.script import tir as T

 add_cuda_source = """
 extern "C" __global__ void add_kernel(float* x, float* y, float* output, int n_elements) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i < n_elements) {
         output[i] = x[i] + y[i];
     }
 }
 """


 @tvm.testing.requires_cuda
 def test_tir_call_source_kernel():
     @I.ir_module
     class Module:
         @T.prim_func
         def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle) -> None:
             T.func_attr({"global_symbol": "add"})
             m = T.int64()
             x = T.match_buffer(x_handle, (m,), "float32")
             y = T.match_buffer(y_handle, (m,), "float32")
             output = T.match_buffer(output_handle, (m,), "float32")
             with T.block("root"):
                 T.reads(x[0:m], y[0:m])
                 T.writes(output[0:m])
                 BLOCK_SIZE = T.meta_var(64)
                 T.call_kernel(
                     add_cuda_source,
                     ((T.ceildiv(m, BLOCK_SIZE),), (BLOCK_SIZE,)),
                     x.data,
                     y.data,
                     output.data,
                     m,
                     kernel_name="add_kernel",
                 )

         @R.function
         def main(x: R.Tensor(("m",), "float32"), y: R.Tensor(("m",), "float32")):
             m = T.int64()
             with R.dataflow():
                 output = R.call_tir(Module.add, [x, y], relax.TensorStructInfo((m,), "float32"))
                 R.output(output)
             return output

     @I.ir_module
     class Parsed:
         @T.prim_func
         def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle):
             m = T.int64()
             x = T.match_buffer(x_handle, (m,))
             y = T.match_buffer(y_handle, (m,))
             output = T.match_buffer(output_handle, (m,))
             with T.block("root"):
                 T.reads(x[0:m], y[0:m])
                 T.writes(output[0:m])
                 T.call_packed(
                     "add_kernel",
                     x.data,
                     y.data,
                     output.data,
                     m,
                     (m + T.int64(64) - T.int64(1)) // T.int64(64),
                     64,
                 )

     tvm.ir.assert_structural_equal(Module["add"], Parsed["add"])
     assert len(Module.get_attr("external_mods")) == 1

     device = tvm.cuda(0)
     x_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device)
     y_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device)
     output_np = x_nd.numpy() + y_nd.numpy()

     with tvm.target.Target("cuda"):
         lib = tvm.compile(Module)
         output_nd = tvm.runtime.vm.VirtualMachine(lib, device)["main"](x_nd, y_nd)
         tvm.testing.assert_allclose(output_nd.numpy(), output_np, rtol=1e-5)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import numpy as np

	import tvm
	import tvm.testing
	from tvm import relax
	from tvm.script import ir as I
	from tvm.script import relax as R
	from tvm.script import tir as T

	add_cuda_source = """
	extern "C" __global__ void add_kernel(float* x, float* y, float* output, int n_elements) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n_elements) {
	output[i] = x[i] + y[i];
	}
	}
	"""


	@tvm.testing.requires_cuda
	def test_tir_call_source_kernel():
	@I.ir_module
	class Module:
	@T.prim_func
	def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle) -> None:
	T.func_attr({"global_symbol": "add"})
	m = T.int64()
	x = T.match_buffer(x_handle, (m,), "float32")
	y = T.match_buffer(y_handle, (m,), "float32")
	output = T.match_buffer(output_handle, (m,), "float32")
	with T.block("root"):
	T.reads(x[0:m], y[0:m])
	T.writes(output[0:m])
	BLOCK_SIZE = T.meta_var(64)
	T.call_kernel(
	add_cuda_source,
	((T.ceildiv(m, BLOCK_SIZE),), (BLOCK_SIZE,)),
	x.data,
	y.data,
	output.data,
	m,
	kernel_name="add_kernel",
	)

	@R.function
	def main(x: R.Tensor(("m",), "float32"), y: R.Tensor(("m",), "float32")):
	m = T.int64()
	with R.dataflow():
	output = R.call_tir(Module.add, [x, y], relax.TensorStructInfo((m,), "float32"))
	R.output(output)
	return output

	@I.ir_module
	class Parsed:
	@T.prim_func
	def add(x_handle: T.handle, y_handle: T.handle, output_handle: T.handle):
	m = T.int64()
	x = T.match_buffer(x_handle, (m,))
	y = T.match_buffer(y_handle, (m,))
	output = T.match_buffer(output_handle, (m,))
	with T.block("root"):
	T.reads(x[0:m], y[0:m])
	T.writes(output[0:m])
	T.call_packed(
	"add_kernel",
	x.data,
	y.data,
	output.data,
	m,
	(m + T.int64(64) - T.int64(1)) // T.int64(64),
	64,
	)

	tvm.ir.assert_structural_equal(Module["add"], Parsed["add"])
	assert len(Module.get_attr("external_mods")) == 1

	device = tvm.cuda(0)
	x_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device)
	y_nd = tvm.runtime.tensor(np.random.rand(256).astype(np.float32), device)
	output_np = x_nd.numpy() + y_nd.numpy()

	with tvm.target.Target("cuda"):
	lib = tvm.compile(Module)
	output_nd = tvm.runtime.vm.VirtualMachine(lib, device)["main"](x_nd, y_nd)
	tvm.testing.assert_allclose(output_nd.numpy(), output_np, rtol=1e-5)