| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """Test parallelism for multiple different scalar workloads.""" |
| |
| import numpy as np |
| |
import tvm
import tvm.testing
| from tvm.script import tir as T |
| |
| from .infrastructure import get_hexagon_target |
| |
TEST_OUTPUT_TEMPLATE = (
    "Test {} with {} operations...\n"
    "    -Single Thread: {} ms\n"
    "    -Parallel: {} ms\n"
    "    -Speedup: {}x\n"
)
| |
| |
| def get_add_operator(operations): |
| """Generate add operator.""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations], dtype="float64") |
| b_buffer = T.match_buffer(b, [operations], dtype="float64") |
| c_buffer = T.match_buffer(c, [operations], dtype="float64") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
| c_buffer[vn_ind] = a_buffer[vn_ind] + b_buffer[vn_ind] |
| |
| return operator |
| |
| |
| def get_multiply_operator(operations): |
| """Generate multiply operator.""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations], dtype="float64") |
| b_buffer = T.match_buffer(b, [operations], dtype="float64") |
| c_buffer = T.match_buffer(c, [operations], dtype="float64") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
| c_buffer[vn_ind] = a_buffer[vn_ind] * b_buffer[vn_ind] |
| |
| return operator |
| |
| |
| def get_sub_operator(operations): |
| """Generate subtract operator.""" |
| |
| @T.prim_func |
| def operator(a: T.handle, b: T.handle, c: T.handle) -> None: |
| T.func_attr({"global_symbol": "main", "tir.noalias": True}) |
| a_buffer = T.match_buffer(a, [operations], dtype="float64") |
| b_buffer = T.match_buffer(b, [operations], dtype="float64") |
| c_buffer = T.match_buffer(c, [operations], dtype="float64") |
| for n in T.grid(operations): |
| with T.block("c_buffer"): |
| vn_ind = T.axis.remap("S", [n]) |
| c_buffer[vn_ind] = a_buffer[vn_ind] - b_buffer[vn_ind] |
| |
| return operator |
| |
| |
| def evaluate(hexagon_session, operations, expected, sch): |
| """Evalute schedule.""" |
| shape = operations |
| dtype = "float64" |
| |
    # Build the scheduled primfunc for Hexagon and load it onto the device
    # through the RPC session.
    func_tir = tvm.compile(sch.mod["main"], target=get_hexagon_target("v68"))
    module = hexagon_session.load_module(func_tir)
| |
| # np.random.random returns float64 by default, but make the cast explicit |
| # to make it easier to switch when necessary. |
| a = np.random.random(shape).astype(dtype) |
| b = np.random.random(shape).astype(dtype) |
| c = np.zeros(shape, dtype=dtype) |
| |
    # Copy the host arrays into device memory on the Hexagon target; c_hexagon
    # is overwritten in place by the kernel.
    a_hexagon = tvm.runtime.tensor(a, device=hexagon_session.device)
    b_hexagon = tvm.runtime.tensor(b, device=hexagon_session.device)
    c_hexagon = tvm.runtime.tensor(c, device=hexagon_session.device)
| |
    # These are reduced for CI, but number=100 and repeat=10 do a good job of
    # removing noise.
    number = 1
    repeat = 1

    # time_evaluator runs the kernel `number` times per measurement and
    # repeats the measurement `repeat` times, returning per-run statistics.
    timer = module.time_evaluator("main", hexagon_session.device, number=number, repeat=repeat)
| runtime = timer(a_hexagon, b_hexagon, c_hexagon) |
| |
    # Verify the device result against a numpy reference computed on the host.
    tvm.testing.assert_allclose(c_hexagon.numpy(), expected(a, b))

    # Report the mean runtime in milliseconds.
    return round(runtime.mean * 1000, 6)
| |
| |
class TestScalarParallel:
    """Parallelism test class for element-wise scalar operators."""
| |
| (operation_name, operator_producer, expected_output_producer,) = tvm.testing.parameters( |
| ("add", get_add_operator, (lambda a, b: a + b)), |
| ("mul", get_multiply_operator, (lambda a, b: a * b)), |
| ("sub", get_sub_operator, (lambda a, b: a - b)), |
| ) |
| |
    # Removed most of these to speed up CI.
| operations = tvm.testing.parameter( |
| 128, |
| # 256, |
| # 512, |
        # A single thread runs faster at these sizes since the L2 cache can
        # service the entire request quickly.
| # 1024, |
| # 2048, |
        # Significant performance degradation once the inputs and outputs
        # cannot all fit in L2.
| # 4096, |
| # 8192, |
| # 16384, |
| ) |
| |
    # The outer loop after splitting has extent `split_factor`, so this is the
    # number of parallel chunks of work.
    split_factor = tvm.testing.parameter(4)
| |
| @tvm.testing.requires_hexagon |
    def test_elementwise(
| self, |
| hexagon_session, |
| operation_name, |
| operator_producer, |
| expected_output_producer, |
| operations, |
| split_factor, |
| ): |
| """Test Add operator.""" |
| |
        # Baseline: run the unscheduled operator on a single thread.
        sch = tvm.tir.Schedule(operator_producer(operations))
        single_thread_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch)
| |
        # Parallel version: split the scalar loop into `split_factor` chunks
        # and run the outer loop across threads.
        sch = tvm.tir.Schedule(operator_producer(operations))
        block = sch.get_block("c_buffer")
        loops = sch.get_loops(block)
        outer, _ = sch.split(loops[0], factors=[split_factor, None])
        sch.parallel(outer)
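        # A rough sketch of the resulting loop nest (variable names are
        # illustrative; print(sch.mod) to see the actual TIR):
        #   for n_0 in T.parallel(split_factor):
        #       for n_1 in T.serial(operations // split_factor):
        #           c_buffer[...] = a_buffer[...] <op> b_buffer[...]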
| parallel_runtime = evaluate(hexagon_session, operations, expected_output_producer, sch) |
| |
| speedup = round(single_thread_runtime / parallel_runtime, 2) |
| print( |
| TEST_OUTPUT_TEMPLATE.format( |
| operation_name, operations, single_thread_runtime, parallel_runtime, speedup |
| ) |
| ) |
| |
| |
| if __name__ == "__main__": |
| tvm.testing.main() |