tests/python/topi/python/test_topi_dense_tensorcore.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
 """Test code for dense tensorcore operator"""
 import numpy as np
 import tvm
 from tvm import topi
 import tvm.topi.testing
 from tvm.topi.utils import get_const_tuple
 from tvm import te
 from tvm.contrib.pickle_memoize import memoize
 import tvm.testing


 # (compute, schedule) pairs under test, keyed by target kind; verify_dense hands
 # this table to tvm.topi.testing.dispatch to obtain the pairs for the device.
 _dense_implement = {"gpu": [(topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore)]}


 def convert_int32_into_int4(a_int32):
     """Pack a 2-D integer array into int4 nibbles, eight per int32 word.

     Only the low four bits of every input element are kept. Within each
     group of eight consecutive elements along the last axis, element 0 ends
     up in the most significant nibble and element 7 in the least significant.

     Parameters
     ----------
     a_int32 : numpy.ndarray
         Integer array of shape (K, L); L must be a multiple of 8.

     Return
     ------
     a_int4 : numpy.ndarray
         Array of shape (K, L // 8) and dtype int32 holding the packed words.
     """
     K, L = a_int32.shape
     assert L % 8 == 0
     # Widen before shifting so that a shift of up to 28 bits cannot overflow,
     # whatever integer dtype the caller supplied.
     nibbles = (a_int32.astype(np.int64) & 0xF).reshape(K, L // 8, 8)
     shifts = np.arange(7, -1, -1, dtype=np.int64) * 4
     # The shifted nibbles occupy disjoint bit ranges, so OR-reducing is exact.
     packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
     # Reinterpret the 32-bit pattern as signed: a top nibble >= 8 goes negative.
     return packed.astype(np.uint32).view(np.int32)


 def convert_int32_into_int4_bias(a_int32):
     """Pack a 1-D integer array into int4 nibbles, eight per int32 word.

     Only the low four bits of every input element are kept. Within each
     group of eight consecutive elements, element 0 ends up in the most
     significant nibble and element 7 in the least significant.

     Parameters
     ----------
     a_int32 : numpy.ndarray
         Integer array of shape (L,); L must be a multiple of 8.

     Return
     ------
     a_int4 : numpy.ndarray
         Array of shape (L // 8,) and dtype int32 holding the packed words.
     """
     (L,) = a_int32.shape
     assert L % 8 == 0
     # Widen before shifting so that a shift of up to 28 bits cannot overflow,
     # whatever integer dtype the caller supplied.
     nibbles = (a_int32.astype(np.int64) & 0xF).reshape(L // 8, 8)
     shifts = np.arange(7, -1, -1, dtype=np.int64) * 4
     # The shifted nibbles occupy disjoint bit ranges, so OR-reducing is exact.
     packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
     # Reinterpret the 32-bit pattern as signed: a top nibble >= 8 goes negative.
     return packed.astype(np.uint32).view(np.int32)


 def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True):
     """Dense tensorcore verify function

     Builds the dense operator followed by ReLU from every implementation that
     ``_dense_implement`` provides for CUDA, runs it on random inputs and
     compares the result with the NumPy reference ``tvm.topi.testing.dense``.

     Parameters
     ----------
     batch : int
         Number of rows of the data placeholder A.
     in_dim : int
         Inner dimension shared by A and the weight placeholder B.
     out_dim : int
         Number of rows of B and length of the bias placeholder C.
     dtype : str
         Input dtype; one of "int4", "int8" or "float16".
     use_bias : bool
         Whether the bias C is given to the operator and to the reference.
     """
     # Fail fast on an unsupported dtype, before any placeholder is created.
     assert dtype in ["int4", "int8", "float16"]

     A = te.placeholder((batch, in_dim), name="A", dtype=dtype)
     B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype)
     C = te.placeholder((out_dim,), name="C", dtype=dtype)

     # Integer inputs produce int32 results, float16 inputs produce float32.
     out_dtype = "float32"
     if dtype in ["int8", "int4"]:
         out_dtype = "int32"

     # use memoize to pickle the test data for next time use
     @memoize("topi.tests.test_topi_dense_tensorcore")
     def get_ref_data():
         # np.random.randint treats `high` as exclusive, so the bounds below
         # are one past the largest representable value of each dtype.
         if dtype == "int4":
             a_np = np.random.randint(low=-8, high=8, size=(batch, in_dim))
             b_np = np.random.randint(low=-8, high=8, size=(out_dim, in_dim))
             c_np = np.random.randint(low=-8, high=8, size=(out_dim,))
         elif dtype == "int8":
             a_np = np.random.randint(low=-128, high=128, size=(batch, in_dim)).astype(dtype)
             b_np = np.random.randint(low=-128, high=128, size=(out_dim, in_dim)).astype(dtype)
             c_np = np.random.randint(low=-128, high=128, size=(out_dim,)).astype(dtype)
         else:
             a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype)
             b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype)
             c_np = np.random.uniform(size=(out_dim,)).astype(dtype)
         # NOTE(review): the positional True is presumably the reference's ReLU
         # switch, matching topi.nn.relu below - confirm against
         # tvm.topi.testing.dense.
         d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype)
         return (a_np, b_np, c_np, d_np)

     # get the test data
     a_np, b_np, c_np, d_np = get_ref_data()
     if dtype == "int4":
         # The reference was computed on the unpacked values; the arrays sent
         # to the device are packed eight int4 values per int32 word.
         a_np = convert_int32_into_int4(a_np)
         b_np = convert_int32_into_int4(b_np)
         c_np = convert_int32_into_int4_bias(c_np)

     def check_device(device):
         dev = tvm.device(device, 0)
         print("Running on target: %s" % device)
         for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement):
             with tvm.target.Target(device):
                 D = fcompute(A, B, C if use_bias else None, out_dtype)
                 D = topi.nn.relu(D)
                 s = fschedule([D])
             a = tvm.nd.array(a_np, dev)
             b = tvm.nd.array(b_np, dev)
             c = tvm.nd.array(c_np, dev)
             d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
             # C stays in the argument list even when use_bias is False, so the
             # built function always takes the same four buffers.
             f = tvm.build(s, [A, B, C, D], device, name="dense")
             f(a, b, c, d)
             tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-3)

     check_device("cuda")


 @tvm.testing.requires_tensorcore
 def test_dense_tensorcore():
     """Test cases

     Runs verify_dense over a set of (batch, in_dim, out_dim) shapes for
     float16 and int8 with and without bias, and for int4 without bias.
     """
     # This shape is hard-coded to float16, so run it once instead of once per
     # iteration of the dtype loop below.
     verify_dense(8, 16, 32, "float16", use_bias=True)
     for dtype in ["float16", "int8"]:
         verify_dense(16, 32, 16, dtype, use_bias=True)
         verify_dense(256, 1024, 1024, dtype, use_bias=True)
         verify_dense(1000, 1024, 1024, dtype, use_bias=False)
         verify_dense(256, 2048, 1000, dtype, use_bias=False)
     # TODO: need fix int4 use_bias=True, wyc-ruiker
     verify_dense(16, 32, 16, "int4", use_bias=False)
     verify_dense(256, 1024, 1024, "int4", use_bias=False)
     verify_dense(1000, 1024, 1024, "int4", use_bias=False)
     verify_dense(256, 2048, 1000, "int4", use_bias=False)


 # Run the test directly when this file is executed as a script.
 if __name__ == "__main__":
     test_dense_tensorcore()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	# pylint: disable=invalid-name, too-many-locals, too-many-statements, unused-argument
	"""Test code for dense tensorcore operator"""
	import numpy as np
	import tvm
	from tvm import topi
	import tvm.topi.testing
	from tvm.topi.utils import get_const_tuple
	from tvm import te
	from tvm.contrib.pickle_memoize import memoize
	import tvm.testing


	# (compute, schedule) pairs under test, keyed by target kind; verify_dense hands
	# this table to tvm.topi.testing.dispatch to obtain the pairs for the device.
	_dense_implement = {"gpu": [(topi.cuda.dense_tensorcore, topi.cuda.schedule_dense_tensorcore)]}


	def convert_int32_into_int4(a_int32):
	    """Pack a 2-D integer array into int4 nibbles, eight per int32 word.

	    Only the low four bits of every input element are kept. Within each
	    group of eight consecutive elements along the last axis, element 0 ends
	    up in the most significant nibble and element 7 in the least significant.

	    Parameters
	    ----------
	    a_int32 : numpy.ndarray
	        Integer array of shape (K, L); L must be a multiple of 8.

	    Return
	    ------
	    a_int4 : numpy.ndarray
	        Array of shape (K, L // 8) and dtype int32 holding the packed words.
	    """
	    K, L = a_int32.shape
	    assert L % 8 == 0
	    # Widen before shifting so that a shift of up to 28 bits cannot overflow,
	    # whatever integer dtype the caller supplied.
	    nibbles = (a_int32.astype(np.int64) & 0xF).reshape(K, L // 8, 8)
	    shifts = np.arange(7, -1, -1, dtype=np.int64) * 4
	    # The shifted nibbles occupy disjoint bit ranges, so OR-reducing is exact.
	    packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
	    # Reinterpret the 32-bit pattern as signed: a top nibble >= 8 goes negative.
	    return packed.astype(np.uint32).view(np.int32)


	def convert_int32_into_int4_bias(a_int32):
	    """Pack a 1-D integer array into int4 nibbles, eight per int32 word.

	    Only the low four bits of every input element are kept. Within each
	    group of eight consecutive elements, element 0 ends up in the most
	    significant nibble and element 7 in the least significant.

	    Parameters
	    ----------
	    a_int32 : numpy.ndarray
	        Integer array of shape (L,); L must be a multiple of 8.

	    Return
	    ------
	    a_int4 : numpy.ndarray
	        Array of shape (L // 8,) and dtype int32 holding the packed words.
	    """
	    (L,) = a_int32.shape
	    assert L % 8 == 0
	    # Widen before shifting so that a shift of up to 28 bits cannot overflow,
	    # whatever integer dtype the caller supplied.
	    nibbles = (a_int32.astype(np.int64) & 0xF).reshape(L // 8, 8)
	    shifts = np.arange(7, -1, -1, dtype=np.int64) * 4
	    # The shifted nibbles occupy disjoint bit ranges, so OR-reducing is exact.
	    packed = np.bitwise_or.reduce(nibbles << shifts, axis=-1)
	    # Reinterpret the 32-bit pattern as signed: a top nibble >= 8 goes negative.
	    return packed.astype(np.uint32).view(np.int32)


	def verify_dense(batch, in_dim, out_dim, dtype, use_bias=True):
	    """Dense tensorcore verify function

	    Builds the dense operator followed by ReLU from every implementation that
	    ``_dense_implement`` provides for CUDA, runs it on random inputs and
	    compares the result with the NumPy reference ``tvm.topi.testing.dense``.

	    Parameters
	    ----------
	    batch : int
	        Number of rows of the data placeholder A.
	    in_dim : int
	        Inner dimension shared by A and the weight placeholder B.
	    out_dim : int
	        Number of rows of B and length of the bias placeholder C.
	    dtype : str
	        Input dtype; one of "int4", "int8" or "float16".
	    use_bias : bool
	        Whether the bias C is given to the operator and to the reference.
	    """
	    # Fail fast on an unsupported dtype, before any placeholder is created.
	    assert dtype in ["int4", "int8", "float16"]

	    A = te.placeholder((batch, in_dim), name="A", dtype=dtype)
	    B = te.placeholder((out_dim, in_dim), name="B", dtype=dtype)
	    C = te.placeholder((out_dim,), name="C", dtype=dtype)

	    # Integer inputs produce int32 results, float16 inputs produce float32.
	    out_dtype = "float32"
	    if dtype in ["int8", "int4"]:
	        out_dtype = "int32"

	    # use memoize to pickle the test data for next time use
	    @memoize("topi.tests.test_topi_dense_tensorcore")
	    def get_ref_data():
	        # np.random.randint treats `high` as exclusive, so the bounds below
	        # are one past the largest representable value of each dtype.
	        if dtype == "int4":
	            a_np = np.random.randint(low=-8, high=8, size=(batch, in_dim))
	            b_np = np.random.randint(low=-8, high=8, size=(out_dim, in_dim))
	            c_np = np.random.randint(low=-8, high=8, size=(out_dim,))
	        elif dtype == "int8":
	            a_np = np.random.randint(low=-128, high=128, size=(batch, in_dim)).astype(dtype)
	            b_np = np.random.randint(low=-128, high=128, size=(out_dim, in_dim)).astype(dtype)
	            c_np = np.random.randint(low=-128, high=128, size=(out_dim,)).astype(dtype)
	        else:
	            a_np = np.random.uniform(size=(batch, in_dim)).astype(dtype)
	            b_np = np.random.uniform(size=(out_dim, in_dim)).astype(dtype)
	            c_np = np.random.uniform(size=(out_dim,)).astype(dtype)
	        # NOTE(review): the positional True is presumably the reference's ReLU
	        # switch, matching topi.nn.relu below - confirm against
	        # tvm.topi.testing.dense.
	        d_np = tvm.topi.testing.dense(a_np, b_np, c_np, use_bias, True, out_dtype)
	        return (a_np, b_np, c_np, d_np)

	    # get the test data
	    a_np, b_np, c_np, d_np = get_ref_data()
	    if dtype == "int4":
	        # The reference was computed on the unpacked values; the arrays sent
	        # to the device are packed eight int4 values per int32 word.
	        a_np = convert_int32_into_int4(a_np)
	        b_np = convert_int32_into_int4(b_np)
	        c_np = convert_int32_into_int4_bias(c_np)

	    def check_device(device):
	        dev = tvm.device(device, 0)
	        print("Running on target: %s" % device)
	        for fcompute, fschedule in tvm.topi.testing.dispatch(device, _dense_implement):
	            with tvm.target.Target(device):
	                D = fcompute(A, B, C if use_bias else None, out_dtype)
	                D = topi.nn.relu(D)
	                s = fschedule([D])
	            a = tvm.nd.array(a_np, dev)
	            b = tvm.nd.array(b_np, dev)
	            c = tvm.nd.array(c_np, dev)
	            d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev)
	            # C stays in the argument list even when use_bias is False, so the
	            # built function always takes the same four buffers.
	            f = tvm.build(s, [A, B, C, D], device, name="dense")
	            f(a, b, c, d)
	            tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-3)

	    check_device("cuda")


	@tvm.testing.requires_tensorcore
	def test_dense_tensorcore():
	    """Test cases

	    Runs verify_dense over a set of (batch, in_dim, out_dim) shapes for
	    float16 and int8 with and without bias, and for int4 without bias.
	    """
	    # This shape is hard-coded to float16, so run it once instead of once per
	    # iteration of the dtype loop below.
	    verify_dense(8, 16, 32, "float16", use_bias=True)
	    for dtype in ["float16", "int8"]:
	        verify_dense(16, 32, 16, dtype, use_bias=True)
	        verify_dense(256, 1024, 1024, dtype, use_bias=True)
	        verify_dense(1000, 1024, 1024, dtype, use_bias=False)
	        verify_dense(256, 2048, 1000, dtype, use_bias=False)
	    # TODO: need fix int4 use_bias=True, wyc-ruiker
	    verify_dense(16, 32, 16, "int4", use_bias=False)
	    verify_dense(256, 1024, 1024, "int4", use_bias=False)
	    verify_dense(1000, 1024, 1024, "int4", use_bias=False)
	    verify_dense(256, 2048, 1000, "int4", use_bias=False)


	# Run the test directly when this file is executed as a script.
	if __name__ == "__main__":
	    test_dense_tensorcore()