| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """ |
| Test parallelizing HVX workloads and compare them to single thread examples. |
| """ |
| import numpy as np |
| |
| import tvm |
| from tvm.script import tir as T |
| |
| from .infrastructure import get_hexagon_target |
| |
| TEST_OUTPUT_TEMPLATE = ( |
| "Test {} with {} operations... \n" |
| " -Single Thread: {} ms \n" |
| " -Parallel: {} ms\n -Speedup: {}x\n" |
| ) |
| |
| |
def get_vrmpy_shape_dtypes(operations):
    """Return (a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype) for the vrmpy operator."""
    return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 32), "int32")
| |
| |
def get_vmpy_vadd_shape_dtype(operations):
    """Return (a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype) for the vmpy and vadd operators."""
    return ((operations, 128), "uint8", (operations, 128), "uint8", (operations, 128), "int16")
| |
| |
def vmpy_expected_producer(shape, a, b):
    """Reference for vmpy: widening 8-bit multiply; even input lanes fill the
    low half of the output and odd lanes fill the high half."""
    expected = np.zeros(shape, dtype="int16")
| for n in range(shape[0]): |
| for i in range(0, 128, 2): |
| expected[n, i // 2] = np.int16(a[n, i]) * np.int16(b[n, i]) |
| for i in range(1, 128, 2): |
| expected[n, i // 2 + 64] = np.int16(a[n, i]) * np.int16(b[n, i]) |
| return expected |
| |
| |
def vadd_expected_producer(shape, a, b):
    """Reference for vadd: widening 8-bit add; even input lanes fill the
    low half of the output and odd lanes fill the high half."""
    expected = np.zeros(shape, dtype="int16")
| for n in range(shape[0]): |
| for i in range(0, 128, 2): |
| expected[n, i // 2] = np.int16(a[n, i]) + np.int16(b[n, i]) |
| for i in range(1, 128, 2): |
| expected[n, i // 2 + 64] = np.int16(a[n, i]) + np.int16(b[n, i]) |
| return expected |
| |
| |
def vrmpy_expected_producer(shape, a, b):
    """Reference for vrmpy: each int32 output lane accumulates the products of
    four consecutive uint8 lanes of a and b."""
    expected = np.zeros(shape, dtype="int32")
| for n in range(shape[0]): |
| for i in range(32): |
| for r_ind in range(4): |
| expected[n, i] = expected[n, i] + np.uint32(a[n, i * 4 + r_ind]) * np.uint32( |
| b[n, i * 4 + r_ind] |
| ) |
| return expected |
| |
| |
| def get_vmpy_operator(operations): |
| """Generate vector multiply operator""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") |
| b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") |
| c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
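                # The 128 uint8 lanes are reinterpreted as int32x32 because the
                # 128-byte HVX LLVM intrinsics take <32 x i32> vector operands.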
| c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( |
| T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vmpybusv.128B"), |
| T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| dtype="int16x128", |
| ) |
| |
| return operator |
| |
| |
| def get_vadd_operator(operations): |
| """Generate vadd operator.""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") |
| b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") |
| c_buffer = T.match_buffer(c, [operations, 128], dtype="int16") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
| c_buffer[vn_ind, T.ramp(0, 1, 128)] = T.call_llvm_intrin( |
| T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vaddubh.128B"), |
| T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| dtype="int16x128", |
| ) |
| |
| return operator |
| |
| |
| def get_vrmpy_operator(operations): |
| """Generate vrmpy operator.""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations, 128], dtype="uint8") |
| b_buffer = T.match_buffer(b, [operations, 128], dtype="uint8") |
| c_buffer = T.match_buffer(c, [operations, 32], dtype="int32") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
| c_buffer[vn_ind, T.ramp(0, 1, 32)] = T.call_llvm_intrin( |
| T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), |
| T.reinterpret(a_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| T.reinterpret(b_buffer[vn_ind, T.ramp(0, 1, 128)], dtype="int32x32"), |
| dtype="int32x32", |
| ) |
| |
| return operator |
| |
| |
| def evaluate(hexagon_session, shape_dtypes, expected_output_producer, sch): |
| """Evaluate schedule.""" |
| a_shape, a_dtype, b_shape, b_dtype, c_shape, c_dtype = shape_dtypes |
| |
| func_tir = tvm.compile(sch.mod["main"], target=get_hexagon_target("v68")) |
| module = hexagon_session.load_module(func_tir) |
| |
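    # Inputs are kept small (0..15), presumably so the widened int16 results cannot overflow.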
| a = np.random.randint(0, 16, a_shape, dtype=a_dtype) |
| b = np.random.randint(0, 16, b_shape, dtype=b_dtype) |
| c = np.zeros(c_shape, dtype=c_dtype) |
| |
| a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device) |
| b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device) |
| c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device) |
| |
    # These are reduced for CI, but number=100 and repeat=10 do a good job of removing noise.
| number = 1 |
| repeat = 1 |
| |
| timer = module.time_evaluator("main", hexagon_session.device, number=number, repeat=repeat) |
| runtime = timer(a_hexagon, b_hexagon, c_hexagon) |
| tvm.testing.assert_allclose(c_hexagon.numpy(), expected_output_producer(c_shape, a, b)) |
| |
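    # time_evaluator reports seconds; convert the mean to milliseconds.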
| return round(runtime.mean * 1000, 6) |
| |
| |
| class TestMatMulVec: |
| """MatMul test class.""" |
| |
| ( |
| operation_name, |
| operator_producer, |
| shape_dtypes_producer, |
| expected_output_producer, |
| ) = tvm.testing.parameters( |
| ("vrmpy", get_vrmpy_operator, get_vrmpy_shape_dtypes, vrmpy_expected_producer), |
| ("vmpy", get_vmpy_operator, get_vmpy_vadd_shape_dtype, vmpy_expected_producer), |
| ("vadd", get_vadd_operator, get_vmpy_vadd_shape_dtype, vadd_expected_producer), |
| ) |
| |
    # Experimentally the best split factor, but all multiples of 4 perform reasonably well.
    # This is because there are 4 HVX units available on the device and pipelining
    # works best with a multiple of the number of available HVX units.
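    # The test below splits the outer operations loop by this factor and marks the
    # resulting outer loop parallel.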
| split_factor = tvm.testing.parameter(4) |
| |
    # Most of these are commented out to speed up CI.
| operation_count = tvm.testing.parameter( |
| 128, |
| # 256, |
| # 512, |
        # Single thread runs faster since the L2 cache can handle the entire request quickly
| # 1024, |
| # 2048, |
        # Significant performance degradation once the inputs and outputs cannot all fit in L2
| # 4096, |
| # 8192, |
| # 16384, |
| ) |
| |
| @tvm.testing.requires_hexagon |
| def test( |
| self, |
| hexagon_session, |
| operation_count, |
| operation_name, |
| operator_producer, |
| shape_dtypes_producer, |
| expected_output_producer, |
| split_factor, |
| ): |
| """Test function handler.""" |
| |
| sch = tvm.tir.Schedule(operator_producer(operation_count)) |
| single_thread_runtime = evaluate( |
| hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch |
| ) |
| |
| sch = tvm.tir.Schedule(operator_producer(operation_count)) |
| block = sch.get_block("c_buffer") |
        loops = sch.get_loops(block)
        b_output, _ = sch.split(loops[0], factors=[split_factor, None])
| sch.parallel(b_output) |
| |
| parallel_runtime = evaluate( |
| hexagon_session, shape_dtypes_producer(operation_count), expected_output_producer, sch |
| ) |
| |
| speedup = round(single_thread_runtime / parallel_runtime, 2) |
| |
| print( |
| TEST_OUTPUT_TEMPLATE.format( |
| operation_name, operation_count, single_thread_runtime, parallel_runtime, speedup |
| ) |
| ) |
| |
| |
| if __name__ == "__main__": |
| tvm.testing.main() |