# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
""" benchmark_elemwise_add """
import os
import os.path
import sys
import tempfile
import numpy as np
import pytest
import tvm.script
import tvm.testing
from tvm.contrib.hexagon.session import Session
from tvm.script import tir as T
from . import benchmark_util as bu
from .infrastructure import get_hexagon_target
_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason()
# This is a fixed detail of the v68 architecture: HVX vectors are 128 bytes (1024 bits) wide.
HVX_VECTOR_BYTES = 128
# NOTE on server ports:
# These tests use different port numbers for the RPC server (7070 + ...).
# The reason is that an RPC session cannot be gracefully closed without
# triggering TIME_WAIT state on the server socket. This prevents another
# server from binding to the same port until the wait time elapses.
_BT = bu.BenchmarksTable()
_CSV_COLUMN_ORDER = [
# Identifies which TE-compute / TIRScript is used as the basis for the
# benchmarked primfunc. Only needs to be meaningful to humans.
"basic_kernel",
# The tensors' element type
"dtype",
    # When applicable, indicates the particular variation of schedules
    # applied by the Python code. Decoding this may require looking at this
    # script's source code.
"sched_type",
# The memory location of the tensors used during the execution of
# the primfunc. We currently assume just one location.
# This will likely need to be generalized as we add more sophisticated
# primfuncs.
"mem_scope",
# For primfuncs that treat tensor buffers as collections of 1D vectors,
# this is the number of vectors in each tensor.
# This will likely need to be generalized as we add more sophisticated
# primfuncs.
"num_vectors_per_tensor",
# Reserved columns defined by the BenchmarksTable class.
"row_status",
"timings_min_usecs",
"timings_max_usecs",
"timings_median_usecs",
"timings_mean_usecs",
"timings_stddev_usecs",
# For benchmarks that produce files on the host file system, this indicates
# their location. Useful for post-mortem investigation of benchmark results.
"host_files_dir_path",
# Miscellaneous comments about the benchmark.
"comments",
]
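# Rows are written in this column order by _BT.print_csv(...) at the end of
# test_elemwise_add, both to the benchmark-results.csv file and to stdout.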
_HOST_OUTPUT_DIR = tempfile.mkdtemp()
_PRIMFUNC_NAME = "elemwise_add"
print("-" * 80)
print("OUTPUT DIRECTORY: {}".format(_HOST_OUTPUT_DIR))
print("-" * 80)
print()
def _get_irmod_elemwise_add(shape: list, dtype: str, mem_scope: str) -> tvm.ir.module.IRModule:
"""
Return an IRModule containing a single primfunc, expressed as NS-TIR.
The primfunc implements elementwise-add. Its signature is (A,B,C), where
A and B are the input tensors, and C is the output tensor.
    All three tensors have the specified shape, dtype, and mem_scope.
    If the specified primfunc is known to be unsupported, raise an UnsupportedException.
"""
assert len(shape) == 2
    # TVMScript can reference simple Python variables, but it doesn't
    # currently support more complex Python expressions...
(
dim0_size,
dim1_size,
) = shape
if mem_scope == "global.vtcm":
raise bu.UnsupportedException("This benchmark kernel does not yet support VTCM buffers.")
    # This check is currently made redundant by the one above, but it should become relevant
    # as soon as we add VTCM support to this kernel generator.
#
# Also: The VTCM budget is a very rough estimate, based only on experience.
# Assuming that it's even reasonable to use a hard-coded estimate AT ALL, this number
# may need tweaking.
    # The code below is commented out to avoid an unreachable-code error from
    # pylint. Please re-enable it once the kernel starts supporting VTCM
    # buffers.
# Code starts below:
# ---- ------ -----
# estimated_vtcm_budget_bytes = HVX_VECTOR_BYTES * 1024
# dtype_bits = tvm.runtime.DataType(dtype).bits
# assert dtype_bits % 8 == 0
# dtype_bytes = dtype_bits // 8
# num_vtcm_tensors = 3
# estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors
# if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes:
# raise bu.UnsupportedException("Expect to exceed VTCM budget.")
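    # Illustrative arithmetic for the estimate above (a rough sketch, not a guarantee):
    # the budget would be 128 * 1024 = 131072 bytes, while a single float16 tensor of
    # shape [2048, 64] needs 2048 * 64 * 2 = 262144 bytes, i.e. 786432 bytes for all
    # three tensors, comfortably over budget, so that case would be skipped.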
@tvm.script.ir_module
class BenchmarkModule:
"""Elementwise STIR module for benchmarking"""
# pylint: disable=no-self-argument,invalid-name,missing-function-docstring
@T.prim_func
def main(a: T.handle, b: T.handle, c: T.handle):
            # We exchange data between functions via handles, which are similar to pointers.
T.func_attr({"global_symbol": "main", "tir.noalias": True})
A = T.match_buffer(a, shape, dtype=dtype)
B = T.match_buffer(b, shape, dtype=dtype)
C = T.match_buffer(c, shape, dtype=dtype)
for i in range(dim0_size):
for j in range(dim1_size):
C[i, j] = A[i, j] + B[i, j]
# pylint: enable=no-self-argument,invalid-name,missing-function-docstring
return BenchmarkModule
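# A typical call (one of the configurations exercised by test_elemwise_add below) would
# be, e.g., _get_irmod_elemwise_add([64, 64], "float16", "global"); "global.vtcm"
# configurations currently raise bu.UnsupportedException and are recorded as skipped.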
def _benchmark_hexagon_elementwise_add_kernel(
hexagon_session: Session, shape: list, dtype: str, mem_scope: str
):
"""
Generate and benchmark a single elementwise-add kernel for Hexagon.
Produce these outputs:
    - Print status updates / results to stdout and/or stderr.
- Create a new subdirectory under _HOST_OUTPUT_DIR, and populate it with
various logs and intermediate files.
- Add to _BT a row describing this benchmark run.
"""
# Represent the benchmark details in a form required by the benchmark table
# and for other logging...
keys_dict = {
"basic_kernel": "ewise-add",
"dtype": dtype,
"shape": shape,
"mem_scope": mem_scope,
}
desc = bu.get_benchmark_decription(keys_dict)
# Create the host-side directory for this benchmark run's files / logs...
host_files_dir_name = bu.get_benchmark_id(keys_dict)
host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name)
os.mkdir(host_files_dir_path)
keys_dict["host_files_dir_path"] = host_files_dir_path
log_file_path = os.path.join(host_files_dir_path, "out.txt")
with open(log_file_path, "w", encoding="UTF-8") as log_file:
print(f"CONFIGURATION: {desc}")
log_file.write(f"CONFIGURATION: {desc}\n")
try:
ns_tir_module = _get_irmod_elemwise_add(shape, dtype, mem_scope)
# Lower the primfunc's IRModule to Hexagon object code...
input1 = tvm.te.placeholder(shape, dtype=dtype)
input2 = tvm.te.placeholder(shape, dtype=dtype)
output = tvm.te.placeholder(shape, dtype=dtype)
built_module: tvm.driver.build_module.OperatorModule = tvm.compile(
ns_tir_module,
[
input1,
input2,
output,
],
get_hexagon_target("v69"),
name=_PRIMFUNC_NAME,
)
# Create an actual Hexagon-native shared object file, initially stored on the
# host's file system...
host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
built_module.write_to_file(host_dso_binary_path)
print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")
# Upload the .so to the Android device's file system (or wherever is appropriate
# when using the Hexagon simulator)...
target_dso_binary_filename = "test_binary.so"
target_dso_binary_pathname = hexagon_session.upload(
host_dso_binary_path, target_dso_binary_filename
)
# Generate our testing / validation data...
(
host_numpy_input1_data,
host_numpy_input2_data,
host_numpy_output_data_expected,
) = _get_elemwise_add_reference_value_tensors(shape, dtype)
# On the target device / simulator, make our Hexagon-native shared object
# available for use...
loaded_hexagon_module: tvm.runtime.module.Module = hexagon_session.load_module(
target_dso_binary_pathname
)
# Create the target-side tensors to hold the primfunc's inputs and outputs...
input1_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
input2_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
output_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
# Populate the primfunc's input tensors...
input1_data.copyfrom(host_numpy_input1_data)
input2_data.copyfrom(host_numpy_input2_data)
# Actually benchmark the primfunc...
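            # With number=10 and repeat=1, the evaluator reports a single measurement,
            # itself the average over 10 back-to-back runs of the kernel.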
timer = loaded_hexagon_module.time_evaluator(
"main", hexagon_session.device, number=10, repeat=1
)
timing_result = timer(input1_data, input2_data, output_data)
print(f"TIMING RESULT: {timing_result}")
log_file.write(f"TIMING RESULT: {timing_result}\n")
# Verify that the computation actually happened, and produced the correct result.
result = output_data.numpy()
if dtype == "float16":
            # These are the tightest tolerances we currently expect / require for these
            # kernels. They may be changed in the future.
rel_tolerance = 0.005
abs_tolerance = 2.0
elif dtype == "int8":
rel_tolerance = 0
abs_tolerance = 0
else:
raise Exception(f"Unexpected dtype: {dtype}")
            # TODO: We're assuming that *any* AssertionError raised by 'assert_allclose' is because
# the numerical differences were too large. But ideally this code would
# differentiate between (a) numerical difference errors, which should simply be
# recorded as a failed benchmark run, vs. (b) more serious errors that should
# kill the overall script.
try:
tvm.testing.assert_allclose(
result, host_numpy_output_data_expected, rel_tolerance, abs_tolerance
)
except AssertionError as err:
                raise bu.NumericalAccuracyException(str(err)) from err
_BT.record_success(timing_result, **keys_dict)
except bu.NumericalAccuracyException as err:
print()
print("FAIL: Numerical accuracy error. See log file.")
log_file.write("\n")
log_file.write(f"FAIL: {err}\n")
_BT.record_fail(**keys_dict, comments="Numerical accuracy error. See log file.")
except bu.UnsupportedException as err:
print()
print(f"SKIP: {err}")
log_file.write("\n")
log_file.write(f"SKIP: {err}\n")
_BT.record_skip(**keys_dict, comments=f"Unsupported configuration: {err}")
def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str):
"""
Return [A:np.array, B:np.array, C:np.array]
`A`, `B`, and `C` are reference data used to exercise and validate
an elementwise-add kernel: C = A+B.
NOTE: These data are primarily meant for performance testing.
The values may be helpful in detecting correctness issues, but that's
a secondary consideration here.
"""
assert len(shape) == 2
input1 = np.ndarray(shape, dtype=dtype)
input2 = np.ndarray(shape, dtype=dtype)
np_dtype = input1.dtype
if np_dtype.kind in ["i", "u"]:
# We allow overflow for integer types because it tends to be well-behaved
# and well-understood...
min_value = np.iinfo(np_dtype).min
max_value = np.iinfo(np_dtype).max
next_value = min_value
for i in range(shape[0]):
for j in range(shape[1]):
input1[i, j] = next_value
input2[i, j] = next_value * 2
next_value += 1
elif np_dtype.kind == "f":
# NOTE: For simplicity, we avoid test data that require
# well-defined behavior on floating-point overflow.
# But it may be reasonable to test that in the future.
min_value = np.finfo(np_dtype).min
max_value = np.finfo(np_dtype).max
min_input_value = min_value / 2.0 + 1
max_input_value = max_value / 2.0 - 2
delta = (max_input_value - min_input_value) / (shape[0] * shape[1])
next_value = min_input_value
for i in range(shape[0]):
for j in range(shape[1]):
input1[i, j] = next_value
input2[i, j] = next_value + 1
next_value += delta
else:
assert False, f"Unexpected data type: {np_dtype}"
output = input1 + input2
return [
input1,
input2,
output,
]
@pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
@tvm.testing.requires_hexagon
def test_elemwise_add(hexagon_session: Session):
"""Main elementwise add test function"""
for dtype in [
"int8",
"float16",
]:
for mem_scope in [
"global",
"global.vtcm",
]:
# These numbers are fairly arbitrary, but they're meant to stress memory/caches to
# various extents.
for num_vectors_per_tensor in [
1,
16,
64,
512,
2048,
]:
dtype_bits = tvm.runtime.DataType(dtype).bits
assert dtype_bits % 8 == 0
dtype_bytes = dtype_bits // 8
elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
shape = [
num_vectors_per_tensor,
elem_per_hvx_vector,
]
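                # For example, float16 (2 bytes per element) gives 128 // 2 = 64 elements
                # per HVX vector, so shape is [num_vectors_per_tensor, 64]; int8 gives
                # [num_vectors_per_tensor, 128].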
print()
_benchmark_hexagon_elementwise_add_kernel(hexagon_session, shape, dtype, mem_scope)
print("-" * 80)
print(f"OUTPUT DIRECTORY: {_HOST_OUTPUT_DIR}")
print("-" * 80)
print()
tabular_output_filename = os.path.join(_HOST_OUTPUT_DIR, "benchmark-results.csv")
with open(tabular_output_filename, "w", encoding="UTF-8") as csv_file:
_BT.print_csv(csv_file, _CSV_COLUMN_ORDER)
print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")
_BT.print_csv(sys.stdout, _CSV_COLUMN_ORDER)
if _BT.has_fail() > 0:
pytest.fail("At least one benchmark configuration failed", pytrace=False)
if __name__ == "__main__":
tvm.testing.main()