# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
""" benchmark_elemwise_add """
import os
import os.path
import sys
import tempfile
import numpy as np
import pytest
import tvm.script
import tvm.testing
from tvm.contrib.hexagon.session import Session
from tvm.script import tir as T
from . import benchmark_util as bu
from .infrastructure import get_hexagon_target
_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_benchmarks_flag_and_reason()
# This is a fixed detail of the v68 architecture: HVX vectors are 128 bytes (1024 bits) wide.
HVX_VECTOR_BYTES = 128
# NOTE on server ports:
# These tests use different port numbers for the RPC server (7070 + ...).
# The reason is that an RPC session cannot be gracefully closed without
# triggering TIME_WAIT state on the server socket. This prevents another
# server from binding to the same port until the wait time elapses.
_BT = bu.BenchmarksTable()
_CSV_COLUMN_ORDER = [
# Identifies which TE-compute / TIRScript is used as the basis for the
# benchmarked primfunc. Only needs to be meaningful to humans.
"basic_kernel",
# The tensors' element type
"dtype",
    # When applicable, indicates the particular variation of schedules
    # applied by the Python code. Decoding this may require looking at this
    # script's source code.
"sched_type",
# The memory location of the tensors used during the execution of
# the primfunc. We currently assume just one location.
# This will likely need to be generalized as we add more sophisticated
# primfuncs.
"mem_scope",
# For primfuncs that treat tensor buffers as collections of 1D vectors,
# this is the number of vectors in each tensor.
# This will likely need to be generalized as we add more sophisticated
# primfuncs.
"num_vectors_per_tensor",
# Reserved columns defined by the BenchmarksTable class.
"row_status",
"timings_min_usecs",
"timings_max_usecs",
"timings_median_usecs",
"timings_mean_usecs",
"timings_stddev_usecs",
# For benchmarks that produce files on the host file system, this indicates
# their location. Useful for post-mortem investigation of benchmark results.
"host_files_dir_path",
# Miscellaneous comments about the benchmark.
"comments",
]
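# Rows are written in this column order by _BT.print_csv(...) at the end of
# test_elemwise_add, both to the benchmark-results.csv file and to stdout.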
_HOST_OUTPUT_DIR = tempfile.mkdtemp()
_PRIMFUNC_NAME = "elemwise_add"
print("-" * 80)
print("OUTPUT DIRECTORY: {}".format(_HOST_OUTPUT_DIR))
print("-" * 80)
print()
def _get_irmod_elemwise_add(shape: list, dtype: str, mem_scope: str) -> tvm.ir.module.IRModule:
"""
Return an IRModule containing a single primfunc, expressed as NS-TIR.
The primfunc implements elementwise-add. Its signature is (A,B,C), where
A and B are the input tensors, and C is the output tensor.
    All three tensors have the specified shape, dtype, and mem_scope.
    If the specified primfunc is known to be unsupported, raise an UnsupportedException.
"""
assert len(shape) == 2
    # TVMScript can reference simple Python variables, but it doesn't
    # currently support more complex Python expressions...
(
dim0_size,
dim1_size,
) = shape
if mem_scope == "global.vtcm":
raise bu.UnsupportedException("This benchmark kernel does not yet support VTCM buffers.")
    # This check is currently made redundant by the one above, but it should become relevant
    # as soon as we add VTCM support to this kernel generator.
#
# Also: The VTCM budget is a very rough estimate, based only on experience.
# Assuming that it's even reasonable to use a hard-coded estimate AT ALL, this number
# may need tweaking.
    # The code below is commented out to avoid an unreachable-code error from
    # pylint. Please re-enable it once the kernel starts supporting VTCM
    # buffers.
# Code starts below:
# ---- ------ -----
# estimated_vtcm_budget_bytes = HVX_VECTOR_BYTES * 1024
# dtype_bits = tvm.runtime.DataType(dtype).bits
# assert dtype_bits % 8 == 0
# dtype_bytes = dtype_bits // 8
# num_vtcm_tensors = 3
# estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors
# if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes:
# raise bu.UnsupportedException("Expect to exceed VTCM budget.")
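    # Illustrative arithmetic for the estimate above (a rough sketch, not a guarantee):
    # the budget would be 128 * 1024 = 131072 bytes, while a single float16 tensor of
    # shape [2048, 64] needs 2048 * 64 * 2 = 262144 bytes, i.e. 786432 bytes for all
    # three tensors, comfortably over budget, so that case would be skipped.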
@tvm.script.ir_module
class BenchmarkModule:
"""Elementwise STIR module for benchmarking"""
# pylint: disable=no-self-argument,invalid-name,missing-function-docstring
@T.prim_func
def main(a: T.handle, b: T.handle, c: T.handle):
            # We exchange data between functions via handles, which are similar to pointers.
T.func_attr({"global_symbol": "main", "tir.noalias": True})
A = T.match_buffer(a, shape, dtype=dtype)
B = T.match_buffer(b, shape, dtype=dtype)
C = T.match_buffer(c, shape, dtype=dtype)
for i in range(dim0_size):
for j in range(dim1_size):
C[i, j] = A[i, j] + B[i, j]
# pylint: enable=no-self-argument,invalid-name,missing-function-docstring
return BenchmarkModule
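# A typical call (one of the configurations exercised by test_elemwise_add below) would
# be, e.g., _get_irmod_elemwise_add([64, 64], "float16", "global"); "global.vtcm"
# configurations currently raise bu.UnsupportedException and are recorded as skipped.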
def _benchmark_hexagon_elementwise_add_kernel(
hexagon_session: Session, shape: list, dtype: str, mem_scope: str
):
"""
Generate and benchmark a single elementwise-add kernel for Hexagon.
Produce these outputs:
    - Print status updates / results to stdout and/or stderr.
- Create a new subdirectory under _HOST_OUTPUT_DIR, and populate it with
various logs and intermediate files.
- Add to _BT a row describing this benchmark run.
"""
# Represent the benchmark details in a form required by the benchmark table
# and for other logging...
keys_dict = {
"basic_kernel": "ewise-add",
"dtype": dtype,
"shape": shape,
"mem_scope": mem_scope,
}
desc = bu.get_benchmark_decription(keys_dict)
# Create the host-side directory for this benchmark run's files / logs...
host_files_dir_name = bu.get_benchmark_id(keys_dict)
host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name)
os.mkdir(host_files_dir_path)
keys_dict["host_files_dir_path"] = host_files_dir_path
log_file_path = os.path.join(host_files_dir_path, "out.txt")
with open(log_file_path, "w", encoding="UTF-8") as log_file:
print(f"CONFIGURATION: {desc}")
log_file.write(f"CONFIGURATION: {desc}\n")
try:
ns_tir_module = _get_irmod_elemwise_add(shape, dtype, mem_scope)
# Lower the primfunc's IRModule to Hexagon object code...
input1 = tvm.te.placeholder(shape, dtype=dtype)
input2 = tvm.te.placeholder(shape, dtype=dtype)
output = tvm.te.placeholder(shape, dtype=dtype)
built_module: tvm.driver.build_module.OperatorModule = tvm.compile(
ns_tir_module,
[
input1,
input2,
output,
],
get_hexagon_target("v69"),
name=_PRIMFUNC_NAME,
)
# Create an actual Hexagon-native shared object file, initially stored on the
# host's file system...
host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
built_module.write_to_file(host_dso_binary_path)
print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")
# Upload the .so to the Android device's file system (or wherever is appropriate
# when using the Hexagon simulator)...
target_dso_binary_filename = "test_binary.so"
target_dso_binary_pathname = hexagon_session.upload(
host_dso_binary_path, target_dso_binary_filename
)
# Generate our testing / validation data...
(
host_numpy_input1_data,
host_numpy_input2_data,
host_numpy_output_data_expected,
) = _get_elemwise_add_reference_value_tensors(shape, dtype)
# On the target device / simulator, make our Hexagon-native shared object
# available for use...
loaded_hexagon_module: tvm.runtime.module.Module = hexagon_session.load_module(
target_dso_binary_pathname
)
# Create the target-side tensors to hold the primfunc's inputs and outputs...
input1_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
input2_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
output_data = tvm.runtime.empty(shape, dtype, hexagon_session.device, mem_scope)
# Populate the primfunc's input tensors...
input1_data.copyfrom(host_numpy_input1_data)
input2_data.copyfrom(host_numpy_input2_data)
# Actually benchmark the primfunc...
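            # With number=10 and repeat=1, the evaluator reports a single measurement,
            # itself the average over 10 back-to-back runs of the kernel.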
timer = loaded_hexagon_module.time_evaluator(
"main", hexagon_session.device, number=10, repeat=1
)
timing_result = timer(input1_data, input2_data, output_data)
print(f"TIMING RESULT: {timing_result}")
log_file.write(f"TIMING RESULT: {timing_result}\n")
# Verify that the computation actually happened, and produced the correct result.
result = output_data.numpy()
if dtype == "float16":
            # These are the tightest tolerances we currently expect / require for these
            # kernels. They may be changed in the future.
rel_tolerance = 0.005
abs_tolerance = 2.0
elif dtype == "int8":
rel_tolerance = 0
abs_tolerance = 0
else:
raise Exception(f"Unexpected dtype: {dtype}")
            # TODO: We're assuming that *any* AssertionError raised by 'assert_allclose' is because
# the numerical differences were too large. But ideally this code would
# differentiate between (a) numerical difference errors, which should simply be
# recorded as a failed benchmark run, vs. (b) more serious errors that should
# kill the overall script.
try:
tvm.testing.assert_allclose(
result, host_numpy_output_data_expected, rel_tolerance, abs_tolerance
)
except AssertionError as err:
                raise bu.NumericalAccuracyException(str(err)) from err
_BT.record_success(timing_result, **keys_dict)
except bu.NumericalAccuracyException as err:
print()
print("FAIL: Numerical accuracy error. See log file.")
log_file.write("\n")
log_file.write(f"FAIL: {err}\n")
_BT.record_fail(**keys_dict, comments="Numerical accuracy error. See log file.")
except bu.UnsupportedException as err:
print()
print(f"SKIP: {err}")
log_file.write("\n")
log_file.write(f"SKIP: {err}\n")
_BT.record_skip(**keys_dict, comments=f"Unsupported configuration: {err}")
def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str):
"""
Return [A:np.array, B:np.array, C:np.array]
`A`, `B`, and `C` are reference data used to exercise and validate
an elementwise-add kernel: C = A+B.
NOTE: These data are primarily meant for performance testing.
The values may be helpful in detecting correctness issues, but that's
a secondary consideration here.
"""
assert len(shape) == 2
input1 = np.ndarray(shape, dtype=dtype)
input2 = np.ndarray(shape, dtype=dtype)
np_dtype = input1.dtype
if np_dtype.kind in ["i", "u"]:
# We allow overflow for integer types because it tends to be well-behaved
# and well-understood...
min_value = np.iinfo(np_dtype).min
max_value = np.iinfo(np_dtype).max
next_value = min_value
for i in range(shape[0]):
for j in range(shape[1]):
input1[i, j] = next_value
input2[i, j] = next_value * 2
next_value += 1
elif np_dtype.kind == "f":
# NOTE: For simplicity, we avoid test data that require
# well-defined behavior on floating-point overflow.
# But it may be reasonable to test that in the future.
min_value = np.finfo(np_dtype).min
max_value = np.finfo(np_dtype).max
min_input_value = min_value / 2.0 + 1
max_input_value = max_value / 2.0 - 2
delta = (max_input_value - min_input_value) / (shape[0] * shape[1])
next_value = min_input_value
for i in range(shape[0]):
for j in range(shape[1]):
input1[i, j] = next_value
input2[i, j] = next_value + 1
next_value += delta
else:
assert False, f"Unexpected data type: {np_dtype}"
output = input1 + input2
return [
input1,
input2,
output,
]
@pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
@tvm.testing.requires_hexagon
def test_elemwise_add(hexagon_session: Session):
"""Main elementwise add test function"""
for dtype in [
"int8",
"float16",
]:
for mem_scope in [
"global",
"global.vtcm",
]:
# These numbers are fairly arbitrary, but they're meant to stress memory/caches to
# various extents.
for num_vectors_per_tensor in [
1,
16,
64,
512,
2048,
]:
dtype_bits = tvm.runtime.DataType(dtype).bits
assert dtype_bits % 8 == 0
dtype_bytes = dtype_bits // 8
elem_per_hvx_vector = HVX_VECTOR_BYTES // dtype_bytes
shape = [
num_vectors_per_tensor,
elem_per_hvx_vector,
]
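                # For example, float16 (2 bytes per element) gives 128 // 2 = 64 elements
                # per HVX vector, so shape is [num_vectors_per_tensor, 64]; int8 gives
                # [num_vectors_per_tensor, 128].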
print()
_benchmark_hexagon_elementwise_add_kernel(hexagon_session, shape, dtype, mem_scope)
print("-" * 80)
print(f"OUTPUT DIRECTORY: {_HOST_OUTPUT_DIR}")
print("-" * 80)
print()
tabular_output_filename = os.path.join(_HOST_OUTPUT_DIR, "benchmark-results.csv")
with open(tabular_output_filename, "w", encoding="UTF-8") as csv_file:
_BT.print_csv(csv_file, _CSV_COLUMN_ORDER)
print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")
_BT.print_csv(sys.stdout, _CSV_COLUMN_ORDER)
if _BT.has_fail() > 0:
pytest.fail("At least one benchmark configuration failed", pytrace=False)
if __name__ == "__main__":
tvm.testing.main()