# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Test eliminate common subexpr pass"""
from typing import List
import tvm
from tvm import relax
import tvm.testing
from tvm.ir.module import IRModule
from tvm.script.parser import ir as I, relax as R
from tvm.runtime import Device
import numpy as np


def compile(
mod: IRModule,
device: List[Device] = [
tvm.cpu(),
],
) -> relax.VirtualMachine:
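    """Lower the annotated module and wrap it in a VirtualMachine bound to `device`."""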
# compile the model
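    # RealizeVDevice propagates virtual device annotations (including hint_on_device hints) through the module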
mod = relax.transform.RealizeVDevice()(mod)
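    # LegalizeOps lowers high-level Relax ops such as R.matmul into TIR PrimFuncs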
mod = relax.transform.LegalizeOps()(mod)
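    # DefaultGPUSchedule binds threads for GPU-targeted PrimFuncs so they can run on the GPU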
mod = tvm.tir.transform.DefaultGPUSchedule()(mod)
    # no target argument is needed for multi-target compilation
ex = tvm.compile(mod)
return relax.VirtualMachine(ex, device)


def test_multi_cpu():
@I.ir_module
class Example:
I.module_attrs({"attr": 10})
I.module_global_infos(
{
"vdevice": [
I.vdevice("llvm", 0),
I.vdevice("llvm", 1),
]
}
)
@R.function
def foo(
x: R.Tensor((2, 3), "float32"),
y: R.Tensor((3, 4), "float32"),
z: R.Tensor((4, 5), "float32"),
) -> R.Tensor((2, 5), "float32"):
with R.dataflow():
lv0 = R.matmul(x, y)
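                # hint that the result of the first matmul should be placed on cpu(0)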
lv0 = R.hint_on_device(lv0, tvm.cpu(0))
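                # copy the value to the second "llvm" vdevice, i.e. tvm.cpu(1) in this test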
lv1: R.Tensor((2, 4), "float32", "llvm:1") = R.to_vdevice( # noqa: F722
lv0, "llvm:1"
)
gv = R.matmul(lv1, z)
R.output(gv)
return gv
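    # The number and ordering of devices must match the vdevice list
    # declared in the module's global_infos.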
devices = [tvm.cpu(0), tvm.cpu(1)]
vm = compile(Example, devices)
np_ipt0 = np.random.rand(2, 3).astype(np.float32)
np_ipt1 = np.random.rand(3, 4).astype(np.float32)
np_ipt2 = np.random.rand(4, 5).astype(np.float32)
np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
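    # x and y feed the first matmul on cpu(0); z is consumed on cpu(1) after the copy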
ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
res = vm["foo"](ipt0, ipt1, ipt2)
tvm.testing.assert_allclose(res.numpy(), np_res)


@tvm.testing.requires_multi_gpu
def test_multi_gpu():
@I.ir_module
class Example:
I.module_attrs({"attr": 10})
I.module_global_infos(
{
"vdevice": [
I.vdevice("cuda", 1),
I.vdevice("cuda", 0),
I.vdevice("cuda", 2),
]
}
)
@R.function
def foo(
a: R.Tensor((2, 3), "float32"),
b: R.Tensor((3, 4), "float32"),
c: R.Tensor((4, 5), "float32"),
d: R.Tensor((5, 6), "float32"),
) -> R.Tensor((2, 6), "float32"):
with R.dataflow():
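                # a chain of matmuls, each annotated with a different GPU vdevice;
                # to_vdevice performs the explicit device-to-device copies between them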
lv0: R.Tensor((2, 4), "float32", "cuda:0") = R.matmul(a, b) # noqa: F722
lv1: R.Tensor((2, 4), "float32", "cuda:1") = R.to_vdevice( # noqa: F722
lv0,
"cuda:1", # noqa: F722
)
lv2: R.Tensor((2, 5), "float32", "cuda:1") = R.matmul(lv1, c) # noqa: F722
lv3: R.Tensor((2, 5), "float32", "cuda:2") = R.to_vdevice( # noqa: F722
lv2,
"cuda:2", # noqa: F722
)
gv: R.Tensor((2, 6), "float32", "cuda:2") = R.matmul(lv3, d) # noqa: F722
R.output(gv)
return gv
    # The number and ordering of devices must match the vdevice list
    # declared in the module's global_infos.
devices = [tvm.cuda(1), tvm.cuda(0), tvm.cuda(2)]
vm = compile(Example, devices)
np_ipt0 = np.random.rand(2, 3).astype(np.float32)
np_ipt1 = np.random.rand(3, 4).astype(np.float32)
np_ipt2 = np.random.rand(4, 5).astype(np.float32)
np_ipt3 = np.random.rand(5, 6).astype(np.float32)
np_res = np.matmul(np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2), np_ipt3)
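    # place each input on the device that runs the matmul consuming it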
ipt0 = tvm.runtime.tensor(np_ipt0, devices[0])
ipt1 = tvm.runtime.tensor(np_ipt1, devices[0])
ipt2 = tvm.runtime.tensor(np_ipt2, devices[1])
ipt3 = tvm.runtime.tensor(np_ipt3, devices[2])
res = vm["foo"](ipt0, ipt1, ipt2, ipt3)
tvm.testing.assert_allclose(res.numpy(), np_res)


@tvm.testing.requires_gpu
def test_multi_device():
@I.ir_module
class Example:
I.module_attrs({"attr": 10})
I.module_global_infos(
{
"vdevice": [
I.vdevice("cuda", 0),
I.vdevice("llvm"),
]
}
)
@R.function
def foo(
x: R.Tensor((2, 3), "float32"),
y: R.Tensor((3, 4), "float32"),
z: R.Tensor((4, 5), "float32"),
) -> R.Tensor((2, 5), "float32"):
with R.dataflow():
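                # the first matmul runs on the CPU ("llvm"); its result is copied
                # to the GPU ("cuda") for the second matmul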
lv0: R.Tensor((2, 4), "float32", "llvm") = R.matmul(x, y)
lv1: R.Tensor((2, 4), "float32", "cuda") = R.to_vdevice(lv0, "cuda")
gv: R.Tensor((2, 5), "float32", "cuda") = R.matmul(lv1, z)
R.output(gv)
return gv
    # The number and ordering of devices must match the vdevice list
    # declared in the module's global_infos.
devices = [tvm.cuda(0), tvm.cpu(0)]
vm = compile(Example, devices)
np_ipt0 = np.random.rand(2, 3).astype(np.float32)
np_ipt1 = np.random.rand(3, 4).astype(np.float32)
np_ipt2 = np.random.rand(4, 5).astype(np.float32)
np_res = np.matmul(np.matmul(np_ipt0, np_ipt1), np_ipt2)
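    # x and y start on the CPU; z lives on the GPU where the second matmul runs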
ipt0 = tvm.runtime.tensor(np_ipt0, devices[1])
ipt1 = tvm.runtime.tensor(np_ipt1, devices[1])
ipt2 = tvm.runtime.tensor(np_ipt2, devices[0])
res = vm["foo"](ipt0, ipt1, ipt2)
tvm.testing.assert_allclose(res.numpy(), np_res, rtol=1e-4, atol=1e-4)
if __name__ == "__main__":
tvm.testing.main()