# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable-msg=too-many-arguments, too-many-locals, assignment-from-no-return
""" Conv Int8 functional and performance testing"""
import sys
import logging
import numpy as np
import tvm
from tvm import te
from tvm import topi
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
LOGGER = logging.getLogger("test_conv_int8_arm")
LOGGER.disabled = False
# All the WORKLOADS from ResNet except the first layer.
# Each workload is (height, width, in_filter, out_filter,
#                   hkernel, wkernel, hpad, wpad, hstride, wstride).
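# For example, (56, 56, 64, 64, 3, 3, 1, 1, 1, 1) below is a 56x56 input with
# 64 input and 64 output channels, a 3x3 kernel, padding of 1 and stride of 1.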
WORKLOADS = [
    (56, 56, 64, 64, 3, 3, 1, 1, 1, 1),
    (56, 56, 64, 64, 1, 1, 0, 0, 1, 1),
    (56, 56, 64, 128, 3, 3, 1, 1, 2, 2),
    (56, 56, 64, 128, 1, 1, 0, 0, 2, 2),
    (28, 28, 128, 128, 3, 3, 1, 1, 1, 1),
    (28, 28, 128, 256, 3, 3, 1, 1, 2, 2),
    (28, 28, 128, 256, 1, 1, 0, 0, 2, 2),
    (14, 14, 256, 256, 3, 3, 1, 1, 1, 1),
    (14, 14, 256, 512, 3, 3, 1, 1, 2, 2),
    (14, 14, 256, 512, 1, 1, 0, 0, 2, 2),
    (7, 7, 512, 512, 3, 3, 1, 1, 1, 1),
    (56, 56, 64, 256, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 64, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 128, 1, 1, 0, 0, 2, 2),
    (28, 28, 128, 512, 1, 1, 0, 0, 1, 1),
    (56, 56, 256, 512, 1, 1, 0, 0, 2, 2),
    (28, 28, 512, 128, 1, 1, 0, 0, 1, 1),
    (28, 28, 512, 256, 1, 1, 0, 0, 2, 2),
    (14, 14, 256, 1024, 1, 1, 0, 0, 1, 1),
    (28, 28, 512, 1024, 1, 1, 0, 0, 2, 2),
    (14, 14, 1024, 256, 1, 1, 0, 0, 1, 1),
    (14, 14, 1024, 512, 1, 1, 0, 0, 2, 2),
    (7, 7, 512, 2048, 1, 1, 0, 0, 1, 1),
    (14, 14, 1024, 2048, 1, 1, 0, 0, 2, 2),
    (7, 7, 2048, 512, 1, 1, 0, 0, 1, 1),
]
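
# The target asks LLVM for an AArch64 CPU with the Armv8.2-A dot-product
# extension (+dotprod, i.e. the udot/sdot instructions), which the int8
# convolution schedule is meant to exploit. NUM_VEC_LANES is the channel
# block size used for the packed NCHWc data and kernel layouts below.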
TARGET_NAME = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+v8.2a,+dotprod"
NUM_VEC_LANES = 16
CTX = tvm.context(TARGET_NAME, 0)


def get_shape(
    im_height, im_width, in_filter, out_filter, k_h, k_w, hpad, wpad, hstride, wstride, out_dtype
):
    """
    Computes the packed (NCHWc) data, kernel and output shapes for a workload.
    """
    data_shape = (1, in_filter // NUM_VEC_LANES, im_height, im_width, NUM_VEC_LANES)
    if out_dtype == "int32" or out_dtype == "uint32":
        kernel_shape = (
            out_filter // NUM_VEC_LANES,
            in_filter // NUM_VEC_LANES,
            k_h,
            k_w,
            NUM_VEC_LANES // 4,
            NUM_VEC_LANES,
            4,
        )
    elif out_dtype == "float32":
        kernel_shape = (
            out_filter // NUM_VEC_LANES,
            in_filter // NUM_VEC_LANES,
            k_h,
            k_w,
            NUM_VEC_LANES,
            NUM_VEC_LANES,
        )
    out_height = (im_height + 2 * hpad - k_h) // hstride + 1
    out_width = (im_width + 2 * wpad - k_w) // wstride + 1
    o_shape = (1, out_filter // NUM_VEC_LANES, out_height, out_width, NUM_VEC_LANES)
    return (data_shape, kernel_shape, o_shape)
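

# Illustrative sketch (not part of the benchmark, which fills random data
# directly in the packed layout): how a plain NCHW activation tensor would be
# repacked into the NCHWc layout whose shape get_shape() returns. The helper
# name is ours, purely for documentation.
def _pack_nchw_to_nchwc(data_nchw):
    """Repack (N, C, H, W) -> (N, C // NUM_VEC_LANES, H, W, NUM_VEC_LANES)."""
    batch, channels, height, width = data_nchw.shape
    assert channels % NUM_VEC_LANES == 0
    blocked = data_nchw.reshape(
        batch, channels // NUM_VEC_LANES, NUM_VEC_LANES, height, width
    )
    return blocked.transpose(0, 1, 3, 4, 2)


# For the int8 path, the kernel shape above is
# (out_filter // 16, in_filter // 16, k_h, k_w, 16 // 4, 16, 4): each block of
# 16 input channels is further split into groups of 4 so that one ARM
# dot-product instruction can reduce 4 int8 values per output lane.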


def run_inference(
    data_dtype,
    kernel_dtype,
    out_dtype,
    im_height,
    im_width,
    in_filter,
    out_filter,
    k_h,
    k_w,
    hpad,
    wpad,
    hstride,
    wstride,
):
    """
    Runs the inference and checks functional correctness between the
    compute declaration output and the scheduled computation output.
    """
    (data_shape, kernel_shape, o_shape) = get_shape(
        im_height,
        im_width,
        in_filter,
        out_filter,
        k_h,
        k_w,
        hpad,
        wpad,
        hstride,
        wstride,
        out_dtype,
    )

    # Create TVM placeholders
    data = te.placeholder(data_shape, name="data", dtype=data_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=kernel_dtype)

    # Create the numpy arrays to be used for executing conv models
    if data_dtype == "float32":
        data_array = tvm.nd.array(np.random.rand(*data_shape).astype(dtype=data_dtype), CTX)
        kernel_array = tvm.nd.array(np.random.rand(*kernel_shape).astype(dtype=kernel_dtype), CTX)
    else:
        data_array = tvm.nd.array(np.random.randint(100, size=data_shape).astype(data_dtype))
        kernel_array = tvm.nd.array(np.random.randint(100, size=kernel_shape).astype(kernel_dtype))

    # c_orig will be used for declaration output
    # c_sch will be used for scheduled computation output
    c_orig = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)
    c_sch = tvm.nd.array(np.zeros(o_shape, dtype=out_dtype), CTX)

    with tvm.target.Target(TARGET_NAME):
        if out_dtype == "float32":
            conv = topi.nn.conv2d_NCHWc(
                data,
                kernel,
                stride=hstride,
                padding=hpad,
                dilation=(1, 1),
                layout="NCHWc",
                out_layout="NCHWc",
                out_dtype=out_dtype,
            )
        else:
            conv = topi.nn.conv2d_NCHWc_int8(
                data,
                kernel,
                strides=hstride,
                padding=hpad,
                dilation=(1, 1),
                layout="NCHWc",
                out_layout="NCHWc",
                out_dtype=out_dtype,
            )
        out = topi.nn.relu(conv)
        sch = te.create_schedule(out.op)
        func = tvm.build(sch, [data, kernel, out], target=TARGET_NAME, name="out")
        func(data_array, kernel_array, c_orig)
        LOGGER.debug(tvm.lower(sch, [data, kernel], simple_mode=True))

        # Generate and run the optimized schedule
        if out_dtype == "float32":
            sconv = topi.generic.nn.schedule_conv2d_NCHWc(outs=[out])
        else:
            sconv = topi.generic.nn.schedule_conv2d_NCHWc_int8(outs=[out])
        func = tvm.build(sconv, [data, kernel, out], target=TARGET_NAME, name="conv")
        func(data_array, kernel_array, c_sch)

        # Functional check
        if data_dtype == "uint8":
            np.testing.assert_equal(c_orig.asnumpy(), c_sch.asnumpy())
        else:
            assert np.allclose(c_orig.asnumpy(), c_sch.asnumpy())

        evaluator = func.time_evaluator(func.entry_name, CTX, number=1000)
        LOGGER.debug(tvm.lower(sconv, [data, kernel], simple_mode=True))
        return evaluator(data_array, kernel_array, c_sch).mean
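

# For reference, a single standalone measurement looks like the following; the
# __main__ block below does this for every WORKLOADS entry, in both the
# unsigned and the signed int8 flavours:
#
#   fp32_time = run_inference("float32", "float32", "float32", *WORKLOADS[0])
#   int8_time = run_inference("uint8", "uint8", "uint32", *WORKLOADS[0])
#   print("speedup:", fp32_time / int8_time)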
if __name__ == "__main__":
    LOGGER.info("Workload, Kernel_size, FP32_time, INT8_time, Speedup")
    SPEEDUP_ARRAY = []
    for i, wkl in enumerate(WORKLOADS):
        for dtype in ["uint", "int"]:
            fp32_time = run_inference("float32", "float32", "float32", *wkl)
            int8_time = run_inference("%s8" % dtype, "%s8" % dtype, "%s32" % dtype, *wkl)
            kernel_h = wkl[4]
            kernel_w = wkl[5]
            LOGGER.info(
                "[%s] Workload#%s, %sx%s, %s, %s, %s",
                dtype,
                i,
                kernel_h,
                kernel_w,
                fp32_time,
                int8_time,
                fp32_time / int8_time,
            )
            SPEEDUP_ARRAY.append(fp32_time / int8_time)
    LOGGER.info("Average speedup --> %s", sum(SPEEDUP_ARRAY) / float(len(SPEEDUP_ARRAY)))