# blob: 7e7d3f2209d38316c9259002c443777db1511d74 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
"""Test code for dense tensorcore operator"""
import numpy as np
import tvm
from tvm import topi
import tvm.topi.testing
from tvm.topi.utils import get_const_tuple
from tvm import te
from tvm.contrib.pickle_memoize import memoize
import tvm.testing
# Compute/schedule pairs exercised by this test, stored under the "gpu" key.
# verify_dense hands this table to tvm.topi.testing.dispatch and iterates the
# (fcompute, fschedule) tuples it yields.
_dense_implement = {
    "gpu": [
        (topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore),
    ],
}
def convert_int32_into_int4(a_int32):
    """Pack a 2-D array of int4-range integers into int32 words.

    Each run of eight consecutive values along the last axis becomes one
    32-bit word. Only the low four bits of every value are kept, so negative
    values contribute their two's-complement nibble. The first value of a run
    occupies the most significant nibble of the word.

    Parameters
    ----------
    a_int32 : numpy.ndarray
        Integer array of shape (K, L). L must be a multiple of 8.

    Returns
    -------
    a_int4 : numpy.ndarray
        int32 array of shape (K, L // 8) holding the packed nibbles.

    Raises
    ------
    AssertionError
        If L is not a multiple of 8.
    """
    K, L = a_int32.shape
    assert L % 8 == 0
    # Keep the low nibble of every value and group eight nibbles per output
    # word. Widening to int64 keeps the shifts below free of signed overflow.
    nibbles = (a_int32 & 0xF).astype(np.int64).reshape(K, L // 8, 8)
    # Nibble m of a group lands at bit offset (7 - m) * 4: 28, 24, ..., 0.
    shifts = np.arange(28, -4, -4, dtype=np.int64)
    packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
    # packed lies in [0, 2**32); reinterpret those 32 bits as a signed word
    # instead of relying on implicit wrap-around when narrowing.
    return packed.astype(np.uint32).view(np.int32)
def convert_int32_into_int4_bias(a_int32):
    """Pack a 1-D array of int4-range integers into int32 words.

    Each run of eight consecutive values becomes one 32-bit word. Only the
    low four bits of every value are kept, so negative values contribute
    their two's-complement nibble. The first value of a run occupies the most
    significant nibble of the word.

    Parameters
    ----------
    a_int32 : numpy.ndarray
        Integer array of shape (L,). L must be a multiple of 8.

    Returns
    -------
    a_int4 : numpy.ndarray
        int32 array of shape (L // 8,) holding the packed nibbles.

    Raises
    ------
    AssertionError
        If L is not a multiple of 8.
    """
    (L,) = a_int32.shape
    assert L % 8 == 0
    # Keep the low nibble of every value and group eight nibbles per output
    # word. Widening to int64 keeps the shifts below free of signed overflow.
    nibbles = (a_int32 & 0xF).astype(np.int64).reshape(L // 8, 8)
    # Nibble m of a group lands at bit offset (7 - m) * 4: 28, 24, ..., 0.
    shifts = np.arange(28, -4, -4, dtype=np.int64)
    packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
    # packed lies in [0, 2**32); reinterpret those 32 bits as a signed word
    # instead of relying on implicit wrap-around when narrowing.
    return packed.astype(np.uint32).view(np.int32)
def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True):
    """Build one tensorcore dense workload, run it on CUDA and check the result.

    Parameters
    ----------
    batch, in_dim, out_dim : int
        Sizes used for the placeholders: A is (batch, in_dim), B is
        (out_dim, in_dim) and the bias C is (out_dim,).
    dtype : str
        Input dtype; must be "int4", "int8" or "float16". Integer inputs
        use an "int32" output, float16 uses "float32".
    use_bias : bool
        When false, ``None`` is passed to the compute instead of C.
    """
    A = te.placeholder((batch, in_dim), name="A", dtype=dtype)
    B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype)
    C = te.placeholder((out_dim,), name="C", dtype=dtype)

    assert dtype in ["int4", "int8", "float16"]

    out_dtype = "int32" if dtype in ["int8", "int4"] else "float32"

    # Memoize pickles the generated data so later runs can reuse it.
    @memoize("topi.tests.test_topi_dense_tensorcore")
    def get_ref_data():
        # Operands are always drawn in the order A, B, bias.
        shapes = ((batch, in_dim), (out_dim, in_dim), (out_dim,))
        if dtype == "int4":
            a_np, b_np, c_np = [
                np.random.randint(low=-8, high=7, size=shape) for shape in shapes
            ]
        elif dtype == "int8":
            a_np, b_np, c_np = [
                np.random.randint(low=-128, high=127, size=shape).astype(dtype)
                for shape in shapes
            ]
        else:
            a_np, b_np, c_np = [
                np.random.uniform(size=shape).astype(dtype) for shape in shapes
            ]
        d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype)
        return (a_np, b_np, c_np, d_np)

    # Fetch the inputs and the reference output.
    a_np, b_np, c_np, d_np = get_ref_data()
    if dtype == "int4":
        # The reference was computed on unpacked values; the device buffers
        # receive eight 4-bit values packed into each int32 word.
        a_np = convert_int32_into_int4(a_np)
        b_np = convert_int32_into_int4(b_np)
        c_np = convert_int32_into_int4_bias(c_np)

    def check_device(device):
        dev = tvm.device(device, 0)
        print("Running on target: %s" % device)
        implementations = tvm.topi.testing.dispatch(device, _dense_implement)
        for fcompute, fschedule in implementations:
            with tvm.target.Target(device):
                bias = C if use_bias else None
                dense_out = fcompute(A, B, bias, out_dtype)
                D = topi.nn.relu(dense_out)
                sched = fschedule([D])
            a_dev = tvm.nd.array(a_np, dev)
            b_dev = tvm.nd.array(b_np, dev)
            c_dev = tvm.nd.array(c_np, dev)
            out_shape = get_const_tuple(D.shape)
            d_dev = tvm.nd.array(np.zeros(out_shape, dtype=out_dtype), dev)
            func = tvm.build(sched, [A, B, C, D], device, name="dense")
            func(a_dev, b_dev, c_dev, d_dev)
            tvm.testing.assert_allclose(d_dev.numpy(), d_np, rtol=1e-3)

    check_device("cuda")
@tvm.testing.requires_tensorcore
def test_dense_tensorcore():
    """Run the tensorcore dense checks for float16, int8 and int4 inputs."""
    # Only float16 is exercised at this shape, so the case sits outside the
    # dtype loop rather than being repeated unchanged on every iteration.
    # NOTE(review): if int8 is meant to cover (8, 16, 32) as well, move this
    # call into the loop and pass ``dtype`` - confirm on tensorcore hardware.
    verify_dense(8, 16, 32, "float16", use_bias=True)

    for dtype in ["float16", "int8"]:
        verify_dense(16, 32, 16, dtype, use_bias=True)
        verify_dense(256, 1024, 1024, dtype, use_bias=True)
        verify_dense(1000, 1024, 1024, dtype, use_bias=False)
        verify_dense(256, 2048, 1000, dtype, use_bias=False)

    # TODO(wyc-ruiker): fix int4 with use_bias=True; until then only the
    # bias-free int4 cases are run.
    verify_dense(16, 32, 16, "int4", use_bias=False)
    verify_dense(256, 1024, 1024, "int4", use_bias=False)
    verify_dense(1000, 1024, 1024, "int4", use_bias=False)
    verify_dense(256, 2048, 1000, "int4", use_bias=False)
if __name__ == "__main__":
    # Allow running the test directly as a script, without a test runner.
    test_dense_tensorcore()