blob: d065d0078e4d763ebc192e0467f501ec61b84872 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Test eliminate common subexpr pass"""
import numpy as np
import tvm
import tvm.testing
from tvm import relax
from tvm.ir.module import IRModule
from tvm.runtime import Device
from tvm.script.parser import ir as I
from tvm.script.parser import relax as R
def compile(
    mod: IRModule,
    device: list[Device] | None = None,
) -> relax.VirtualMachine:
    """Lower *mod* for heterogeneous execution and wrap it in a VirtualMachine.

    Parameters
    ----------
    mod : IRModule
        The Relax module to build. Its ``vdevice`` global info determines
        which targets the functions are compiled for.
    device : list[Device] | None
        Runtime devices for the VM, ordered to match the module's
        ``vdevice`` list. Defaults to a single CPU device.

    Returns
    -------
    relax.VirtualMachine
        A VM ready to invoke the module's functions on *device*.

    NOTE: the name shadows the builtin ``compile``; kept for the existing
    callers in this file.
    """
    # Use a None sentinel instead of a mutable list default, which would be
    # shared across calls.
    if device is None:
        device = [tvm.cpu()]
    # Materialize vdevice hints/annotations, then lower Relax ops to TIR and
    # apply the fallback GPU schedule to any unscheduled PrimFuncs.
    mod = relax.transform.RealizeVDevice()(mod)
    mod = relax.transform.LegalizeOps()(mod)
    mod = tvm.s_tir.transform.DefaultGPUSchedule()(mod)
    # No need to feed a target argument for multi-target compilation; targets
    # are derived from the module's vdevice annotations.
    ex = tvm.compile(mod)
    return relax.VirtualMachine(ex, device)
def test_multi_cpu():
    """Run a chained matmul whose two halves execute on two CPU virtual devices."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Two CPU virtual devices; their order corresponds to the `devices`
        # list passed to the VirtualMachine below.
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("llvm", 0),
                    I.vdevice("llvm", 1),
                ]
            }
        )
        @R.function
        def foo(
            x: R.Tensor((2, 3), "float32"),
            y: R.Tensor((3, 4), "float32"),
            z: R.Tensor((4, 5), "float32"),
        ) -> R.Tensor((2, 5), "float32"):
            with R.dataflow():
                lv0 = R.matmul(x, y)
                # Pin the intermediate to cpu(0); this hint is expected to be
                # turned into an explicit vdevice by RealizeVDevice in compile().
                lv0 = R.hint_on_device(lv0, tvm.cpu(0))
                # Copy the intermediate to the second CPU device ("llvm:1")
                # before the final matmul.
                lv1: R.Tensor((2, 4), "float32", "llvm:1") = R.to_vdevice(lv0, "llvm:1")
                gv = R.matmul(lv1, z)
                R.output(gv)
            return gv
    devices = [tvm.cpu(0), tvm.cpu(1)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference result for the chained matmul.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
    # Place each input on the device where it is first consumed: x/y on the
    # first CPU, z on the second.
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
    res = vm["foo"](ipt0, ipt1, ipt2)
    tvm.testing.assert_allclose(res.numpy(), np_res)
@tvm.testing.requires_multi_gpu
def test_multi_gpu():
    """Run a three-matmul pipeline hopping across three CUDA virtual devices."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Three CUDA virtual devices. Note the vdevice list order (device ids
        # 1, 0, 2) — the runtime `devices` list below must match it exactly.
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("cuda", 1),
                    I.vdevice("cuda", 0),
                    I.vdevice("cuda", 2),
                ]
            }
        )
        @R.function
        def foo(
            a: R.Tensor((2, 3), "float32"),
            b: R.Tensor((3, 4), "float32"),
            c: R.Tensor((4, 5), "float32"),
            d: R.Tensor((5, 6), "float32"),
        ) -> R.Tensor((2, 6), "float32"):
            with R.dataflow():
                # First matmul on "cuda:0" (the first entry of the vdevice list).
                lv0: R.Tensor((2, 4), "float32", "cuda:0") = R.matmul(a, b)
                # Copy to the second vdevice for the middle matmul.
                lv1: R.Tensor((2, 4), "float32", "cuda:1") = R.to_vdevice(
                    lv0,
                    "cuda:1",
                )
                lv2: R.Tensor((2, 5), "float32", "cuda:1") = R.matmul(lv1, c)
                # Copy to the third vdevice for the final matmul.
                lv3: R.Tensor((2, 5), "float32", "cuda:2") = R.to_vdevice(
                    lv2,
                    "cuda:2",
                )
                gv: R.Tensor((2, 6), "float32", "cuda:2") = R.matmul(lv3, d)
                R.output(gv)
            return gv
    # The number and ordering of devices should be identical with the vdevice list
    # defined in global_infos of ir_module
    devices = [tvm.cuda(1), tvm.cuda(0), tvm.cuda(2)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference for the three chained matmuls.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_ipt3 = np.random.rand(5, 6).astype(np.float32)
    np_res = np.matmul(np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2), np_ipt3)
    # Each input lives on the device where it is first consumed.
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
    ipt3 = tvm.runtime.tensor(np_ipt3, devices[2])
    res = vm["foo"](ipt0, ipt1, ipt2, ipt3)
    tvm.testing.assert_allclose(res.numpy(), np_res)
@tvm.testing.requires_gpu
def test_multi_device():
    """Run a pipeline mixing device kinds: first matmul on CPU, second on CUDA."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Mixed-kind virtual devices: one CUDA GPU and one CPU ("llvm").
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("cuda", 0),
                    I.vdevice("llvm"),
                ]
            }
        )
        @R.function
        def foo(
            x: R.Tensor((2, 3), "float32"),
            y: R.Tensor((3, 4), "float32"),
            z: R.Tensor((4, 5), "float32"),
        ) -> R.Tensor((2, 5), "float32"):
            with R.dataflow():
                # First matmul runs on the CPU vdevice.
                lv0: R.Tensor((2, 4), "float32", "llvm") = R.matmul(x, y)
                # Copy the intermediate to the GPU for the second matmul.
                lv1: R.Tensor((2, 4), "float32", "cuda") = R.to_vdevice(lv0, "cuda")
                gv: R.Tensor((2, 5), "float32", "cuda") = R.matmul(lv1, z)
                R.output(gv)
            return gv
    # The number and ordering of devices should be identical with the vdevice list
    # defined in global_infos of ir_module
    devices = [tvm.cuda(0), tvm.cpu(0)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference for the chained matmul.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
    # x/y are consumed on the CPU (devices[1]); z on the GPU (devices[0]).
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[1])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[1])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[0])
    res = vm["foo"](ipt0, ipt1, ipt2)
    # Looser tolerances than the CPU-only test, presumably to absorb
    # GPU/CPU float32 matmul differences.
    tvm.testing.assert_allclose(res.numpy(), np_res, rtol=1e-4, atol=1e-4)
if __name__ == "__main__":
tvm.testing.main()