| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| """Testing topi conv2d_transpose operator for VTA""" |
| |
| import json |
| import os |
| |
| import numpy as np |
| from collections import namedtuple |
| |
| import tvm |
| from tvm import autotvm |
| from tvm.contrib import util |
| from tvm.contrib.pickle_memoize import memoize |
| import topi |
| import topi.testing |
| import vta |
| from vta import program_fpga, reconfig_runtime |
| import vta.testing |
| from vta.testing import simulator |
| |
| |
| Workload = namedtuple("Conv2DTransposeWorkload", |
| ['batch', 'height', 'width', 'in_filter', 'out_filter', |
| 'hkernel', 'wkernel', 'hpad', 'wpad', 'hstride', 'wstride']) |
| |
| # Get batch info from env |
| env = vta.get_env() |
| |
| # DCGAN workloads |
| dcgan_wklds = [ |
| # dcgan |
| ('DCGAN.CT1', Workload(env.BATCH, 4, 4, 1024, 512, 4, 4, 1, 1, 2, 2)), |
| ('DCGAN.CT2', Workload(env.BATCH, 8, 8, 512, 256, 4, 4, 1, 1, 2, 2)), |
| ('DCGAN.CT3', Workload(env.BATCH, 16, 16, 256, 128, 4, 4, 1, 1, 2, 2)), |
| ] |
| |
| # FIXME: we need a custom clip operator to circumvent a pattern detection limitation |
| @tvm.tag_scope(tag=topi.tag.ELEMWISE) |
| def my_clip(x, a_min, a_max): |
| """Unlike topi's current clip, put min and max into two stages.""" |
| const_min = tvm.const(a_min, x.dtype) |
| const_max = tvm.const(a_max, x.dtype) |
| x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA") |
| x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB") |
| return x |
| |
| # Helper function to get factors |
| def _find_factors(n): |
| factors = [] |
| for f in range(1, n + 1): |
| if n % f == 0: |
| factors.append(f) |
| return factors |
| |
| |
| def run_conv2d_transpose(env, remote, wl, target, |
| check_correctness=True, print_ir=False, |
| samples=4): |
| |
| # Workload assertions |
| assert wl.hpad == wl.wpad |
| |
| # Perform packing only if we are targeting the accelerator |
| if "arm_cpu" in target.keys: |
| data_pack = False |
| layout = "NCHW" |
| elif "vta" in target.keys: |
| data_pack = True |
| layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN) |
| |
| # Derive shapes depending upon packing |
| |
| a_shape = (wl.batch, wl.in_filter, wl.height, wl.width) |
| w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel) |
| if data_pack: |
| data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN, |
| wl.height, wl.width, env.BATCH, env.BLOCK_IN) |
| kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN, |
| wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN) |
| else: |
| data_shape = a_shape |
| kernel_shape = w_shape |
| data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype) |
| kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype) |
| |
| # Define base computation schedule |
| with target: |
| res = topi.nn.conv2d_transpose_nchw( |
| data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), env.acc_dtype) |
| res = topi.right_shift(res, env.WGT_WIDTH) |
| res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1) |
| res = topi.cast(res, env.out_dtype) |
| # Derive base schedule |
| s = topi.generic.schedule_conv2d_transpose_nchw([res]) |
| if print_ir: |
| print(vta.lower(s, [data, kernel, res], simple_mode=True)) |
| |
| # Derive number of ops |
| fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel |
| fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel |
| num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter |
| |
| # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc") |
| def get_ref_data(): |
| # derive min max for act and wgt types (max non inclusive) |
| a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1)) |
| w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1)) |
| a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype) |
| w_np = np.random.randint(w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)).astype(kernel.dtype) |
| r_np = topi.testing.conv2d_transpose_nchw_python( |
| a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype) |
| return a_np, w_np, r_np |
| |
| # Data in original format |
| data_np, kernel_np, res_ref = get_ref_data() |
| if data_pack: |
| data_np = data_np.reshape( |
| wl.batch//env.BATCH, env.BATCH, |
| wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, |
| wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3)) |
| kernel_np = kernel_np.reshape( |
| wl.in_filter//env.BLOCK_IN, env.BLOCK_IN, |
| wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT, |
| wl.hkernel, wl.wkernel).transpose((2, 0, 4, 5, 3, 1)) |
| kernel_np = np.flip(kernel_np, 2) |
| kernel_np = np.flip(kernel_np, 3) |
| |
| # Build |
| if "vta" in target.keys: |
| mod = vta.build(s, [data, kernel, res], |
| target=target, |
| target_host=env.target_host, |
| name="conv2d_transpose") |
| else: |
| mod = tvm.build(s, [data, kernel, res], |
| target=target, |
| target_host=env.target_host, |
| name="conv2d_transpose") |
| temp = util.tempdir() |
| mod.save(temp.relpath("conv2d_transpose.o")) |
| remote.upload(temp.relpath("conv2d_transpose.o")) |
| f = remote.load_module("conv2d_transpose.o") |
| ctx = remote.context(str(target)) |
| |
| res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype) |
| data_arr = tvm.nd.array(data_np, ctx) |
| kernel_arr = tvm.nd.array(kernel_np, ctx) |
| res_arr = tvm.nd.array(res_np, ctx) |
| time_f = f.time_evaluator("conv2d_transpose", ctx, number=samples) |
| |
| # In vta sim mode, collect simulator runtime statistics |
| stats = {} |
| cost = None |
| if env.TARGET in ["sim", "tsim"]: |
| # Check if we're in local RPC mode (allows us to rebuild the |
| # runtime on the fly when varying the VTA designs) |
| local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0")) |
| if local_rpc: |
| if env.TARGET == "sim": |
| remote.get_function("vta.simulator.profiler_clear")() |
| else: |
| remote.get_function("vta.tsim.profiler_clear")() |
| cost = time_f(data_arr, kernel_arr, res_arr) |
| if env.TARGET == "sim": |
| stats = json.loads(remote.get_function("vta.simulator.profiler_status")()) |
| else: |
| stats = json.loads(remote.get_function("vta.tsim.profiler_status")()) |
| else: |
| simulator.clear_stats() |
| cost = time_f(data_arr, kernel_arr, res_arr) |
| stats = simulator.stats() |
| else: |
| cost = time_f(data_arr, kernel_arr, res_arr) |
| |
| # Check correctness |
| correct = False |
| if check_correctness: |
| res_orig = res_arr.asnumpy() |
| if data_pack: |
| res_orig = res_orig.transpose( |
| (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width) |
| res_ref = res_ref >> env.WGT_WIDTH |
| res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1) |
| res_ref = res_ref.astype(env.out_dtype) |
| correct = np.allclose(res_orig, res_ref) |
| |
| gops = (num_ops / cost.mean) / float(10 ** 9) |
| status = "PASSED" if correct else "FAILED" |
| if "arm_cpu" in target.keys: |
| device = "CPU" |
| elif "vta" in target.keys: |
| device = "VTA" |
| print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops)) |
| |
| return correct, cost, stats |
| |
| def test_conv2d_transpose(device="vta"): |
| def _run(env, remote): |
| if device == "vta": |
| target = env.target |
| if env.TARGET not in ["sim", "tsim"]: |
| assert tvm.module.enabled("rpc") |
| program_fpga(remote, bitstream=None) |
| reconfig_runtime(remote) |
| elif device == "arm_cpu": |
| target = env.target_vta_cpu |
| with autotvm.tophub.context(target): # load pre-tuned schedule parameters |
| for _, wl in dcgan_wklds: |
| print(wl) |
| run_conv2d_transpose(env, remote, wl, target) |
| vta.testing.run(_run) |
| |
| if __name__ == "__main__": |
| # test_conv2d_transpose(device="arm_cpu") |
| test_conv2d_transpose(device="vta") |