tests/python/contrib/test_cublas.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 import tvm
 from tvm import te
 import numpy as np
 from tvm.contrib import cublas
 from tvm.contrib import cublaslt
 import tvm.testing


 def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
     n = 1024
     l = 128
     m = 236
     A = te.placeholder((n, l), name="A", dtype=in_dtype)
     B = te.placeholder((l, m), name="B", dtype=in_dtype)
     C = cublas.matmul(A, B, dtype=out_dtype)
     s = te.create_schedule(C.op)

     def verify(target="cuda"):
         if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
             print("skip because extern function is not available")
             return
         ctx = tvm.gpu(0)
         f = tvm.build(s, [A, B, C], target)
         a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
         f(a, b, c)
         tvm.testing.assert_allclose(
             c.asnumpy(), np.dot(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)), rtol=rtol
         )

     verify()


 def roundoff(v, d):
     return int(np.floor((v + d - 1) / d) * d)


 def verify_matmul_add_igemm(in_dtype, out_dtype, rtol=1e-5):
     n = 1024
     l = 1024
     m = 1024
     L = roundoff(l, 32)
     N = roundoff(n, 8)
     N_out = roundoff(n, 32)

     A = te.placeholder((N, L), name="A", dtype=in_dtype)
     B = te.placeholder((m, L), name="B", dtype=in_dtype)
     # C has CUBLASLT_ORDER_COL32 layout, thus a different shape
     C = cublaslt.matmul(A, B, False, True, m, N_out, dtype=out_dtype)
     s = te.create_schedule(C.op)

     def verify(target="cuda"):
         if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True):
             print("skip because extern function is not available")
             return
         ctx = tvm.gpu(0)
         f = tvm.build(s, [A, B, C], target)
         a_old = np.random.uniform(0, 128, size=(n, l))
         b_old = np.random.uniform(0, 128, size=(l, m))

         # Transform a to become CUBLASLT_ORDER_COL4_4R2_8C layout
         a_new = np.hstack((a_old.astype(A.dtype), np.zeros([n, L - l])))
         a_new = np.vstack((a_new.astype(A.dtype), np.zeros([N - n, L])))
         a_even = np.vsplit(a_new[::2], N / 8)
         a_odd = np.vsplit(a_new[1::2], N / 8)
         a_new = [None] * (len(a_even) + len(a_odd))
         a_new[::2] = a_even
         a_new[1::2] = a_odd
         a_new = np.vstack(a_new)
         a_new = np.vstack(
             np.vstack(np.vstack(np.hsplit(i, 8)).reshape([4, 32]) for i in np.vsplit(j, N / 4))
             for j in np.hsplit(a_new, L / 32)
         )
         a_new = a_new.reshape([N, L])
         # Transform b to become CUBLASLT_ORDER_COL32 layout
         b_new = np.vstack(
             np.hsplit(np.hstack((b_old.T.astype(B.dtype), np.zeros([m, L - l]))), L / 32)
         )
         b_new = b_new.reshape([m, L])

         a = tvm.nd.array(a_new.astype(A.dtype), ctx)
         b = tvm.nd.array(b_new.astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), ctx)
         f(a, b, c)
         # Transform output c from layout CUBLASLT_ORDER_COL32 to row major layout
         c_out = c.asnumpy()
         c_out = c_out.reshape([int(m * N_out / 32), 32])
         c_out = np.hstack(np.vsplit(c_out, int(N_out / 32)))
         c_out = c_out[:, :n]
         c_out = c_out.T
         tvm.testing.assert_allclose(
             c_out, np.dot(a_old.astype(C.dtype), b_old.astype(C.dtype)), rtol=rtol
         )

     verify()


 def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5):
     j = 16
     n = 1024
     l = 128
     m = 236
     A = te.placeholder((j, n, l), name="A", dtype=in_dtype)
     B = te.placeholder((j, l, m), name="B", dtype=in_dtype)
     C = cublas.batch_matmul(A, B, dtype=out_dtype)
     s = te.create_schedule(C.op)

     def verify(target="cuda"):
         if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
             print("skip because extern function is not available")
             return
         ctx = tvm.gpu(0)
         f = tvm.build(s, [A, B, C], target)
         a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), ctx)
         b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), ctx)
         c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), ctx)
         f(a, b, c)
         tvm.testing.assert_allclose(
             c.asnumpy(),
             np.matmul(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)).astype(C.dtype),
             rtol=rtol,
         )

     verify()


 @tvm.testing.requires_cuda
 def test_matmul_add():
     verify_matmul_add("float", "float", rtol=1e-3)
     verify_matmul_add("float16", "float")
     verify_matmul_add("float16", "float16", rtol=1e-2)
     verify_matmul_add("int8", "int32")


 @tvm.testing.requires_cuda
 def test_matmul_add_igemm():
     verify_matmul_add_igemm("int8", "int32")


 @tvm.testing.requires_cuda
 def test_batch_matmul():
     verify_batch_matmul("float", "float")
     verify_batch_matmul("float16", "float")
     verify_batch_matmul("float16", "float16", rtol=1e-2)


 if __name__ == "__main__":
     test_matmul_add()
     test_batch_matmul()
     test_matmul_add_igemm()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	import tvm
	from tvm import te
	import numpy as np
	from tvm.contrib import cublas
	from tvm.contrib import cublaslt
	import tvm.testing


	def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5):
	n = 1024
	l = 128
	m = 236
	A = te.placeholder((n, l), name="A", dtype=in_dtype)
	B = te.placeholder((l, m), name="B", dtype=in_dtype)
	C = cublas.matmul(A, B, dtype=out_dtype)
	s = te.create_schedule(C.op)

	def verify(target="cuda"):
	if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
	print("skip because extern function is not available")
	return
	ctx = tvm.gpu(0)
	f = tvm.build(s, [A, B, C], target)
	a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), ctx)
	b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), ctx)
	c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
	f(a, b, c)
	tvm.testing.assert_allclose(
	c.asnumpy(), np.dot(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)), rtol=rtol
	)

	verify()


	def roundoff(v, d):
	return int(np.floor((v + d - 1) / d) * d)


	def verify_matmul_add_igemm(in_dtype, out_dtype, rtol=1e-5):
	n = 1024
	l = 1024
	m = 1024
	L = roundoff(l, 32)
	N = roundoff(n, 8)
	N_out = roundoff(n, 32)

	A = te.placeholder((N, L), name="A", dtype=in_dtype)
	B = te.placeholder((m, L), name="B", dtype=in_dtype)
	# C has CUBLASLT_ORDER_COL32 layout, thus a different shape
	C = cublaslt.matmul(A, B, False, True, m, N_out, dtype=out_dtype)
	s = te.create_schedule(C.op)

	def verify(target="cuda"):
	if not tvm.get_global_func("tvm.contrib.cublaslt.matmul", True):
	print("skip because extern function is not available")
	return
	ctx = tvm.gpu(0)
	f = tvm.build(s, [A, B, C], target)
	a_old = np.random.uniform(0, 128, size=(n, l))
	b_old = np.random.uniform(0, 128, size=(l, m))

	# Transform a to become CUBLASLT_ORDER_COL4_4R2_8C layout
	a_new = np.hstack((a_old.astype(A.dtype), np.zeros([n, L - l])))
	a_new = np.vstack((a_new.astype(A.dtype), np.zeros([N - n, L])))
	a_even = np.vsplit(a_new[::2], N / 8)
	a_odd = np.vsplit(a_new[1::2], N / 8)
	a_new = [None] * (len(a_even) + len(a_odd))
	a_new[::2] = a_even
	a_new[1::2] = a_odd
	a_new = np.vstack(a_new)
	a_new = np.vstack(
	np.vstack(np.vstack(np.hsplit(i, 8)).reshape([4, 32]) for i in np.vsplit(j, N / 4))
	for j in np.hsplit(a_new, L / 32)
	)
	a_new = a_new.reshape([N, L])
	# Transform b to become CUBLASLT_ORDER_COL32 layout
	b_new = np.vstack(
	np.hsplit(np.hstack((b_old.T.astype(B.dtype), np.zeros([m, L - l]))), L / 32)
	)
	b_new = b_new.reshape([m, L])

	a = tvm.nd.array(a_new.astype(A.dtype), ctx)
	b = tvm.nd.array(b_new.astype(B.dtype), ctx)
	c = tvm.nd.array(np.zeros((m, N_out), dtype=C.dtype), ctx)
	f(a, b, c)
	# Transform output c from layout CUBLASLT_ORDER_COL32 to row major layout
	c_out = c.asnumpy()
	c_out = c_out.reshape([int(m * N_out / 32), 32])
	c_out = np.hstack(np.vsplit(c_out, int(N_out / 32)))
	c_out = c_out[:, :n]
	c_out = c_out.T
	tvm.testing.assert_allclose(
	c_out, np.dot(a_old.astype(C.dtype), b_old.astype(C.dtype)), rtol=rtol
	)

	verify()


	def verify_batch_matmul(in_dtype, out_dtype, rtol=1e-5):
	j = 16
	n = 1024
	l = 128
	m = 236
	A = te.placeholder((j, n, l), name="A", dtype=in_dtype)
	B = te.placeholder((j, l, m), name="B", dtype=in_dtype)
	C = cublas.batch_matmul(A, B, dtype=out_dtype)
	s = te.create_schedule(C.op)

	def verify(target="cuda"):
	if not tvm.get_global_func("tvm.contrib.cublas.matmul", True):
	print("skip because extern function is not available")
	return
	ctx = tvm.gpu(0)
	f = tvm.build(s, [A, B, C], target)
	a = tvm.nd.array(np.random.uniform(size=(j, n, l)).astype(A.dtype), ctx)
	b = tvm.nd.array(np.random.uniform(size=(j, l, m)).astype(B.dtype), ctx)
	c = tvm.nd.array(np.zeros((j, n, m), dtype=C.dtype), ctx)
	f(a, b, c)
	tvm.testing.assert_allclose(
	c.asnumpy(),
	np.matmul(a.asnumpy().astype(C.dtype), b.asnumpy().astype(C.dtype)).astype(C.dtype),
	rtol=rtol,
	)

	verify()


	@tvm.testing.requires_cuda
	def test_matmul_add():
	verify_matmul_add("float", "float", rtol=1e-3)
	verify_matmul_add("float16", "float")
	verify_matmul_add("float16", "float16", rtol=1e-2)
	verify_matmul_add("int8", "int32")


	@tvm.testing.requires_cuda
	def test_matmul_add_igemm():
	verify_matmul_add_igemm("int8", "int32")


	@tvm.testing.requires_cuda
	def test_batch_matmul():
	verify_batch_matmul("float", "float")
	verify_batch_matmul("float16", "float")
	verify_batch_matmul("float16", "float16", rtol=1e-2)


	if __name__ == "__main__":
	test_matmul_add()
	test_batch_matmul()
	test_matmul_add_igemm()