blob: d065d0078e4d763ebc192e0467f501ec61b84872 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Test eliminate common subexpr pass"""
import numpy as np
import tvm
import tvm.testing
from tvm import relax
from tvm.ir.module import IRModule
from tvm.runtime import Device
from tvm.script.parser import ir as I
from tvm.script.parser import relax as R
def compile(
    mod: IRModule,
    device: list[Device] | None = None,
) -> relax.VirtualMachine:
    """Lower *mod* for heterogeneous execution and wrap it in a VirtualMachine.

    Parameters
    ----------
    mod : IRModule
        The Relax module to build. Its ``vdevice`` global info determines
        which targets the functions are compiled for.
    device : list[Device] | None
        Runtime devices for the VM, ordered to match the module's
        ``vdevice`` list. Defaults to a single CPU device.

    Returns
    -------
    relax.VirtualMachine
        A VM ready to invoke the module's functions on *device*.

    NOTE: the name shadows the builtin ``compile``; kept for the existing
    callers in this file.
    """
    # Use a None sentinel instead of a mutable list default, which would be
    # shared across calls.
    if device is None:
        device = [tvm.cpu()]
    # Materialize vdevice hints/annotations, then lower Relax ops to TIR and
    # apply the fallback GPU schedule to any unscheduled PrimFuncs.
    mod = relax.transform.RealizeVDevice()(mod)
    mod = relax.transform.LegalizeOps()(mod)
    mod = tvm.s_tir.transform.DefaultGPUSchedule()(mod)
    # No need to feed a target argument for multi-target compilation; targets
    # are derived from the module's vdevice annotations.
    ex = tvm.compile(mod)
    return relax.VirtualMachine(ex, device)
def test_multi_cpu():
    """Run a chained matmul whose two halves execute on two CPU virtual devices."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Two CPU virtual devices; their order corresponds to the `devices`
        # list passed to the VirtualMachine below.
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("llvm", 0),
                    I.vdevice("llvm", 1),
                ]
            }
        )
        @R.function
        def foo(
            x: R.Tensor((2, 3), "float32"),
            y: R.Tensor((3, 4), "float32"),
            z: R.Tensor((4, 5), "float32"),
        ) -> R.Tensor((2, 5), "float32"):
            with R.dataflow():
                lv0 = R.matmul(x, y)
                # Pin the intermediate to cpu(0); this hint is expected to be
                # turned into an explicit vdevice by RealizeVDevice in compile().
                lv0 = R.hint_on_device(lv0, tvm.cpu(0))
                # Copy the intermediate to the second CPU device ("llvm:1")
                # before the final matmul.
                lv1: R.Tensor((2, 4), "float32", "llvm:1") = R.to_vdevice(lv0, "llvm:1")
                gv = R.matmul(lv1, z)
                R.output(gv)
            return gv
    devices = [tvm.cpu(0), tvm.cpu(1)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference result for the chained matmul.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
    # Place each input on the device where it is first consumed: x/y on the
    # first CPU, z on the second.
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
    res = vm["foo"](ipt0, ipt1, ipt2)
    tvm.testing.assert_allclose(res.numpy(), np_res)
@tvm.testing.requires_multi_gpu
def test_multi_gpu():
    """Run a three-matmul pipeline hopping across three CUDA virtual devices."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Three CUDA virtual devices. Note the vdevice list order (device ids
        # 1, 0, 2) — the runtime `devices` list below must match it exactly.
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("cuda", 1),
                    I.vdevice("cuda", 0),
                    I.vdevice("cuda", 2),
                ]
            }
        )
        @R.function
        def foo(
            a: R.Tensor((2, 3), "float32"),
            b: R.Tensor((3, 4), "float32"),
            c: R.Tensor((4, 5), "float32"),
            d: R.Tensor((5, 6), "float32"),
        ) -> R.Tensor((2, 6), "float32"):
            with R.dataflow():
                # First matmul on "cuda:0" (the first entry of the vdevice list).
                lv0: R.Tensor((2, 4), "float32", "cuda:0") = R.matmul(a, b)
                # Copy to the second vdevice for the middle matmul.
                lv1: R.Tensor((2, 4), "float32", "cuda:1") = R.to_vdevice(
                    lv0,
                    "cuda:1",
                )
                lv2: R.Tensor((2, 5), "float32", "cuda:1") = R.matmul(lv1, c)
                # Copy to the third vdevice for the final matmul.
                lv3: R.Tensor((2, 5), "float32", "cuda:2") = R.to_vdevice(
                    lv2,
                    "cuda:2",
                )
                gv: R.Tensor((2, 6), "float32", "cuda:2") = R.matmul(lv3, d)
                R.output(gv)
            return gv
    # The number and ordering of devices should be identical with the vdevice list
    # defined in global_infos of ir_module
    devices = [tvm.cuda(1), tvm.cuda(0), tvm.cuda(2)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference for the three chained matmuls.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_ipt3 = np.random.rand(5, 6).astype(np.float32)
    np_res = np.matmul(np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2), np_ipt3)
    # Each input lives on the device where it is first consumed.
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
    ipt3 = tvm.runtime.tensor(np_ipt3, devices[2])
    res = vm["foo"](ipt0, ipt1, ipt2, ipt3)
    tvm.testing.assert_allclose(res.numpy(), np_res)
@tvm.testing.requires_gpu
def test_multi_device():
    """Run a pipeline mixing device kinds: first matmul on CPU, second on CUDA."""
    @I.ir_module
    class Example:
        I.module_attrs({"attr": 10})
        # Mixed-kind virtual devices: one CUDA GPU and one CPU ("llvm").
        I.module_global_infos(
            {
                "vdevice": [
                    I.vdevice("cuda", 0),
                    I.vdevice("llvm"),
                ]
            }
        )
        @R.function
        def foo(
            x: R.Tensor((2, 3), "float32"),
            y: R.Tensor((3, 4), "float32"),
            z: R.Tensor((4, 5), "float32"),
        ) -> R.Tensor((2, 5), "float32"):
            with R.dataflow():
                # First matmul runs on the CPU vdevice.
                lv0: R.Tensor((2, 4), "float32", "llvm") = R.matmul(x, y)
                # Copy the intermediate to the GPU for the second matmul.
                lv1: R.Tensor((2, 4), "float32", "cuda") = R.to_vdevice(lv0, "cuda")
                gv: R.Tensor((2, 5), "float32", "cuda") = R.matmul(lv1, z)
                R.output(gv)
            return gv
    # The number and ordering of devices should be identical with the vdevice list
    # defined in global_infos of ir_module
    devices = [tvm.cuda(0), tvm.cpu(0)]
    vm = compile(Example, devices)
    # Random inputs plus a NumPy reference for the chained matmul.
    np_ipt0 = np.random.rand(2, 3).astype(np.float32)
    np_ipt1 = np.random.rand(3, 4).astype(np.float32)
    np_ipt2 = np.random.rand(4, 5).astype(np.float32)
    np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
    # x/y are consumed on the CPU (devices[1]); z on the GPU (devices[0]).
    ipt0 = tvm.runtime.tensor(np_ipt0, devices[1])
    ipt1 = tvm.runtime.tensor(np_ipt1, devices[1])
    ipt2 = tvm.runtime.tensor(np_ipt2, devices[0])
    res = vm["foo"](ipt0, ipt1, ipt2)
    # Looser tolerances than the CPU-only test, presumably to absorb
    # GPU/CPU float32 matmul differences.
    tvm.testing.assert_allclose(res.numpy(), np_res, rtol=1e-4, atol=1e-4)
if __name__ == "__main__":
tvm.testing.main()