# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return
""" Conv Int8 functional and performance testing"""
import sys
import logging
import numpy as np
import tvm
from tvm import te
from tvm import topi
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOGGER = logging.getLogger("test_conv_int8_arm")
LOGGER.disabled = False
# All the WORKLOADS from ResNet except the first layer.
# Each workload is (height, width, in_filter, out_filter,
#                   hkernel, wkernel, hpad, wpad, hstride, wstride).
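# For example, (56, 56, 64, 64, 3, 3, 1, 1, 1, 1) below is a 56x56 input with
# 64 input and 64 output channels, a 3x3 kernel, padding of 1 and stride of 1.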
WORKLOADS = [
    (56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
    (56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
    (56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
    (56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
    (28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
    (28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
    (28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
    (14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
    (14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
    (14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
    (7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
    (56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
    (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
    (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
    (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
    (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
    (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
    (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
    (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
    (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
    (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
    (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
]
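
# The target asks LLVM for an AArch64 CPU with the Armv8.2-A dot-product
# extension (+dotprod, i.e. the udot/sdot instructions), which the int8
# convolution schedule is meant to exploit. NUM_VEC_LANES is the channel
# block size used for the packed NCHWc data and kernel layouts below.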
TARGET_NAME = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
NUM_VEC_LANES = 16
CTX = tvm.context(TARGET_NAME, 0)


def get_shape(
    im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, hstride, wstride, out_dtype
):
    """
    Computes the packed (NCHWc) data, kernel and output shapes for a workload.
    """
    data_shape = (1, in_filter // NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
    if out_dtype == "int32" or out_dtype == "uint32":
        kernel_shape = (
            out_filter // NUM_VEC_LANES,
            in_filter // NUM_VEC_LANES,
            k_h,
            k_w,
            NUM_VEC_LANES // 4,
            NUM_VEC_LANES,
            4,
        )
    elif out_dtype == "float32":
        kernel_shape = (
            out_filter // NUM_VEC_LANES,
            in_filter // NUM_VEC_LANES,
            k_h,
            k_w,
            NUM_VEC_LANES,
            NUM_VEC_LANES,
        )
    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
    o_shape = (1, out_filter // NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
    return (data_shape, kernel_shape, o_shape)
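

# Illustrative sketch (not part of the benchmark, which fills random data
# directly in the packed layout): how a plain NCHW activation tensor would be
# repacked into the NCHWc layout whose shape get_shape() returns. The helper
# name is ours, purely for documentation.
def _pack_nchw_to_nchwc(data_nchw):
    """Repack (N, C, H, W) -> (N, C // NUM_VEC_LANES, H, W, NUM_VEC_LANES)."""
    batch, channels, height, width = data_nchw.shape
    assert channels % NUM_VEC_LANES == 0
    blocked = data_nchw.reshape(
        batch, channels // NUM_VEC_LANES, NUM_VEC_LANES, height, width
    )
    return blocked.transpose(0, 1, 3, 4, 2)


# For the int8 path, the kernel shape above is
# (out_filter // 16, in_filter // 16, k_h, k_w, 16 // 4, 16, 4): each block of
# 16 input channels is further split into groups of 4 so that one ARM
# dot-product instruction can reduce 4 int8 values per output lane.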


def run_inference(
    data_dtype,
    kernel_dtype,
    out_dtype,
    im_height,
    im_width,
    in_filter,
    out_filter,
    k_h,
    k_w,
    hpad,
    wpad,
    hstride,
    wstride,
):
    """
    Runs the inference and checks functional correctness between the
    compute declaration output and the scheduled computation output.
    """
    (data_shape, kernel_shape, o_shape) = get_shape(
        im_height,
        im_width,
        in_filter,
        out_filter,
        k_h,
        k_w,
        hpad,
        wpad,
        hstride,
        wstride,
        out_dtype,
    )

    # Create TVM placeholders
    data = te.placeholder(data_shape, name="data", dtype=data_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == "float32":
        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))

    # c_orig will be used for declaration output
    # c_sch will be used for scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)

    with tvm.target.Target(TARGET_NAME):
        if out_dtype == "float32":
            conv = topi.nn.conv2d_NCHWc(
                data,
                kernel,
                stride=hstride,
                padding=hpad,
                dilation=(1, 1),
                layout="NCHWc",
                out_layout="NCHWc",
                out_dtype=out_dtype,
            )
        else:
            conv = topi.nn.conv2d_NCHWc_int8(
                data,
                kernel,
                strides=hstride,
                padding=hpad,
                dilation=(1, 1),
                layout="NCHWc",
                out_layout="NCHWc",
                out_dtype=out_dtype,
            )
        out = topi.nn.relu(conv)
        sch = te.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name="out")
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        if out_dtype == "float32":
            sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        else:
            sconv = topi.generic.nn.schedule_conv2d_NCHWc_int8(outs=[out])
        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name="conv")
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == "uint8":
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
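

# For reference, a single standalone measurement looks like the following; the
# __main__ block below does this for every WORKLOADS entry, in both the
# unsigned and the signed int8 flavours:
#
#   fp32_time = run_inference("float32", "float32", "float32", *WORKLOADS[0])
#   int8_time = run_inference("uint8", "uint8", "uint32", *WORKLOADS[0])
#   print("speedup:", fp32_time / int8_time)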
if __name__ == "__main__":
    LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup")
    SPEEDUP_ARRAY = []
    for i, wkl in enumerate(WORKLOADS):
        for dtype in ["uint", "int"]:
            fp32_time = run_inference("float32", "float32", "float32", *wkl)
            int8_time = run_inference("%s8" % dtype, "%s8" % dtype, "%s32" % dtype, *wkl)
            kernel_h = wkl[4]
            kernel_w = wkl[5]
            LOGGER.info(
                "[%s] Workload#%s, %sx%s, %s, %s, %s",
                dtype,
                i,
                kernel_h,
                kernel_w,
                fp32_time,
                int8_time,
                fp32_time / int8_time,
            )
            SPEEDUP_ARRAY.append(fp32_time / int8_time)
    LOGGER.info("Average speedup --> %s", sum(SPEEDUP_ARRAY) / float(len(SPEEDUP_ARRAY)))