# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Testing topi conv2d operator for VTA"""

import json
import os
from collections import namedtuple

import pytest
import numpy as np

import tvm
from tvm import te
from tvm import relay
from tvm import autotvm
from tvm.contrib import utils
from tvm.contrib.pickle_memoize import memoize
from tvm import topi
import tvm.topi.testing
import vta
from vta import program_fpga, reconfig_runtime
import vta.testing
from vta.testing import simulator


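# A conv2d workload: NCHW input dims, channel counts, kernel size, padding, and stride.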
Workload = namedtuple(
    "Conv2DWorkload",
    [
        "batch",
        "height",
        "width",
        "in_filter",
        "out_filter",
        "hkernel",
        "wkernel",
        "hpad",
        "wpad",
        "hstride",
        "wstride",
    ],
)

# Get batch info from env
env = vta.get_env()

# ResNet18 workloads
resnet_wkls = [
    # Workloads of resnet18 on imagenet
    # ('resnet-18.C1', Workload(env.BATCH, 224, 224, 3, 64, 7, 7, 3, 3, 2, 2)),
    ("resnet-18.C2", Workload(env.BATCH, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1)),
    ("resnet-18.C3", Workload(env.BATCH, 56, 56, 64, 128, 3, 3, 1, 1, 2, 2)),
    ("resnet-18.C4", Workload(env.BATCH, 56, 56, 64, 128, 1, 1, 0, 0, 2, 2)),
    ("resnet-18.C5", Workload(env.BATCH, 28, 28, 128, 128, 3, 3, 1, 1, 1, 1)),
    ("resnet-18.C6", Workload(env.BATCH, 28, 28, 128, 256, 3, 3, 1, 1, 2, 2)),
    ("resnet-18.C7", Workload(env.BATCH, 28, 28, 128, 256, 1, 1, 0, 0, 2, 2)),
    ("resnet-18.C8", Workload(env.BATCH, 14, 14, 256, 256, 3, 3, 1, 1, 1, 1)),
    ("resnet-18.C9", Workload(env.BATCH, 14, 14, 256, 512, 3, 3, 1, 1, 2, 2)),
    ("resnet-18.C10", Workload(env.BATCH, 14, 14, 256, 512, 1, 1, 0, 0, 2, 2)),
    ("resnet-18.C11", Workload(env.BATCH, 7, 7, 512, 512, 3, 3, 1, 1, 1, 1)),
]
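# For example, resnet-18.C2 is a 3x3 / stride-1 / pad-1 convolution over a 56x56
# map with 64 input and 64 output channels, so its output stays 56x56:
# (56 + 2 * 1 - 3) // 1 + 1 == 56.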

# FIXME: we need a custom clip operator to circumvent a pattern detection limitation
@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages."""
    const_min = tvm.tir.const(a_min, x.dtype)
    const_max = tvm.tir.const(a_max, x.dtype)
    x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
    x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
    return x
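
# A minimal usage sketch (hypothetical shapes):
#   x = te.placeholder((8, 8), name="x", dtype="int32")
#   y = my_clip(x, 0, 127)  # two chained ELEMWISE stages: "clipA", then "clipB"
# Splitting min and max into separate stages keeps each one simple enough for
# VTA's pattern matcher to map onto its ALU min/max operations.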


def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4):
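    """Build and run one conv2d workload on `target` via `remote`.

    Returns (correct, cost, stats): correctness flag, timing results, and
    simulator statistics (an empty dict when not running on sim/tsim).
    """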

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack
        conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed
    else:
        # Fail early with a clear message rather than a NameError below
        raise ValueError("Unsupported target: %s" % str(target))

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
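    # For instance, with the default VTA config (BATCH=1, BLOCK_IN=BLOCK_OUT=16),
    # resnet-18.C2 packs the (1, 64, 56, 56) data into (1, 4, 56, 56, 1, 16).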
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))
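    # get_pad_tuple2d expands the symmetric (hpad, wpad) pair into the
    # (top, left, bottom, right) form expected by the conv2d compute.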

    # Define base computation schedule
    with target:
        if data_pack:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype
            )
        else:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), env.acc_dtype
            )
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = conv2d_fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )
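    # Each output element accumulates hkernel * wkernel * in_filter multiply-adds,
    # and each multiply-add counts as two ops. E.g. for resnet-18.C2 with BATCH=1:
    # 2 * 1 * 56 * 56 * 3 * 3 * 64 * 64 ~= 231 Mops.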

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non-inclusive)
        a_min, a_max = -(1 << (env.INP_WIDTH - 1)), 1 << (env.INP_WIDTH - 1)
        w_min, w_max = -(1 << (env.WGT_WIDTH - 1)), 1 << (env.WGT_WIDTH - 1)
        b_min = -(1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))
        b_max = 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
        ).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT
        )
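    # The reshapes above split N into (N // BATCH, BATCH) and C into
    # (C // BLOCK, BLOCK); the transposes then move the BATCH and BLOCK axes
    # innermost, yielding the NCHWnc layout the packed VTA kernels expect.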

    # Build
    if "vta" in target.keys:
        mod = vta.build(
            s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d"
        )
    else:
        mod = tvm.build(
            s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d"
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    bias_arr = tvm.nd.array(bias_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d", dev, number=samples)
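    # time_evaluator returns a callable that runs "conv2d" `samples` times on the
    # remote device and reports aggregate timing (cost.mean is used below).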

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness: apply the same shift / bias-add / clip / cast pipeline
    # to the reference result as the hardware schedule above (the fixed shift
    # of 8 in the compute matches env.WGT_WIDTH in the default 8-bit config)
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
            bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10 ** 9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats


@pytest.mark.parametrize("device", ["vta", "arm_cpu"])
def test_conv2d(device):
    def _run(env, remote):
        if device == "vta":
            target = env.target
            if env.TARGET not in ["sim", "tsim", "intelfocl"]:
                assert tvm.runtime.enabled("rpc")
                program_fpga(remote, bitstream=None)
                reconfig_runtime(remote)
        elif device == "arm_cpu":
            target = env.target_vta_cpu
        with autotvm.tophub.context(target):  # load pre-tuned schedule parameters
            for _, wl in resnet_wkls:
                print(wl)
                run_conv2d(env, remote, wl, target)

    vta.testing.run(_run)


if __name__ == "__main__":
    test_conv2d(device="arm_cpu")
    test_conv2d(device="vta")