python/tvm/topi/arm_cpu/conv2d_alter_op.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name,unused-variable,unused-argument,no-member
 """Conv2D alter op and legalize functions for arm cpu"""

 import logging

 import tvm
 from tvm import te
 from tvm import relay
 from tvm import autotvm

 from ..nn import conv2d_alter_layout, conv2d_legalize
 from ..utils import get_const_tuple
 from ..x86.conv2d import _get_default_config as _get_x86_default_config
 from ..x86.conv2d_int8 import _get_default_config_int8
 from .conv2d_int8 import is_int8_hw_support
 from .arm_utils import get_tiling_B_interleaved_t
 from ..generic.conv2d import conv2d_alter_int8_common

 logger = logging.getLogger("topi")


 def interleave_transpose_weights(inputs, data, kernel, interleave_A):
     """Transform the weight matrix by reshaping, interleaving and transposing it

     Parameters
     ----------
     inputs : tvm.relay.Expr
         Grouped input symbols
     data :
         Input shape and dtype
     kernel :
         Input shape and dtype
     interleave_A: indicates if we expect matrix A to be interleaved

     Returns
     ----------
     new_kernel : tvm.te.placeholder
                  A placeholder with the new shape
     new_kernel_expr : tvm.relay.Expr
                 The relay expression of the weights
     """
     assert (
         data.dtype == "int8"
         and kernel.dtype == "int8"
         or data.dtype == "uint8"
         and kernel.dtype == "uint8"
     )

     KH, KW, IC, OC = get_const_tuple(kernel.shape)
     K = KH * KW * IC
     N = OC

     # Get tiling information for the interleaved transposed version of B
     tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A)

     pad_K = 0
     pad_N = 0

     if N % tile_rows_B != 0:
         pad_N = tile_rows_B - (N % tile_rows_B)
     if K % tile_cols_B != 0:
         pad_K = tile_cols_B - (K % tile_cols_B)

     N_padded = N + pad_N
     K_padded = K + pad_K
     new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(
         inputs[1], tile_rows_B, tile_cols_B
     )
     new_kernel = te.placeholder(
         (N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype
     )
     return new_kernel, new_kernel_expr


 @conv2d_alter_layout.register(["arm_cpu"])
 def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     target = tvm.target.Target.current(allow_none=False)
     dispatch_ctx = autotvm.task.DispatchContext.current

     _, outs = relay.backend.te_compiler.select_implementation(
         relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
     )
     workload = autotvm.task.get_workload(outs)
     if workload is None:
         # The best implementation is not an AutoTVM template,
         # we then assume it's not necessary to alter this op.
         return None
     cfg = dispatch_ctx.query(target, workload)

     topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}

     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     dilation = attrs.get_int_tuple("dilation")
     data_layout = attrs["data_layout"]
     kernel_layout = attrs["kernel_layout"]
     data, kernel = tinfos
     out_dtype = out_type.dtype

     # Extract data types
     data_tensor, kernel_tensor = tinfos
     data_dtype = data_tensor.dtype
     kernel_dtype = kernel_tensor.dtype

     idxd = tvm.tir.indexdiv

     # We don't perform layout alteration for NHWC layout with real data types
     if data_layout == "NHWC" and data_dtype not in ["uint8", "int8"]:
         return None

     if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         N, CI, H, W = get_const_tuple(data.shape)
         CO, _, KH, KW = get_const_tuple(kernel.shape)
         VC = cfg["tile_co"].size[-1]

         new_attrs["kernel_layout"] = "OIHW%do" % VC

         new_data = data
         new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
             [new_data, new_kernel, strides, padding, dilation, out_dtype],
             "conv2d_nchw_spatial_pack.arm_cpu",
         )
         dispatch_ctx.update(target, new_workload, cfg)

         return relay.nn.conv2d(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu":
         assert (
             data.dtype == "int8"
             and kernel.dtype == "int8"
             or data.dtype == "uint8"
             and kernel.dtype == "uint8"
         )

         assert data_layout == "NHWC" and kernel_layout == "HWIO"

         data_expr, kernel_expr = inputs

         data_int16 = relay.cast(data_expr, dtype="int16")
         kernel_int16 = relay.cast(kernel_expr, dtype="int16")

         new_attrs = {k: attrs[k] for k in attrs.keys()}

         new_data = te.placeholder(data.shape, "int16")
         new_kernel = te.placeholder(kernel.shape, "int16")

         new_workload = autotvm.task.args_to_workload(
             [new_data, new_kernel, strides, padding, dilation, out_dtype],
             "conv2d_nhwc_spatial_pack.arm_cpu",
         )
         dispatch_ctx.update(target, new_workload, cfg)

         return relay.nn.conv2d(data_int16, kernel_int16, **new_attrs)

     if topi_tmpl == "conv2d_nchw_winograd.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         N, CI, H, W = get_const_tuple(data.shape)
         CO, _, KH, KW = get_const_tuple(kernel.shape)
         VC = cfg["tile_k"].size[-1]
         tile_size = 4

         weight_expr = inputs[1]
         weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
             weight_expr, tile_size=tile_size
         )
         weight_expr = relay.reshape(
             weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI)
         )
         weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3])

         new_attrs["tile_size"] = tile_size
         new_attrs["channels"] = CO

         new_data = data
         new_kernel = te.placeholder(
             (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC), kernel.dtype
         )
         new_workload = autotvm.task.args_to_workload(
             [new_data, new_kernel, strides, padding, dilation, out_dtype],
             "conv2d_nchw_winograd.arm_cpu",
         )
         dispatch_ctx.update(target, new_workload, cfg)

         return relay.nn.contrib_conv2d_winograd_without_weight_transform(
             inputs[0], weight_expr, **new_attrs
         )

     if topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         N, CI, H, W = get_const_tuple(data.shape)
         CO, _, KH, KW = get_const_tuple(kernel.shape)
         new_attrs["channels"] = CO

         # pre-compute winograd_nnpack transform
         # for winograd_nnpack_fp16, the precompute prune pass must run on device,
         # where float16 is supported
         weight_dtype = "float32"
         weight_expr = inputs[1]
         transformed_weight = relay.nn.contrib_conv2d_winograd_nnpack_weight_transform(
             weight_expr,
             convolution_algorithm=cfg["winograd_nnpack_algorithm"].val,
             out_dtype=weight_dtype,
         )

         new_data = data
         new_kernel = te.placeholder((CO, CI, 8, 8), "float32")

         new_workload = autotvm.task.args_to_workload(
             [new_data, new_kernel, None, strides, padding, dilation, out_dtype],
             "conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu",
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return relay.nn.contrib_conv2d_winograd_without_weight_transform(
             inputs[0], transformed_weight, **new_attrs
         )

     if topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         N, CI, H, W = get_const_tuple(data.shape)
         CO, M, KH, KW = get_const_tuple(kernel.shape)
         VC = cfg["tile_co"].size[-1]

         new_attrs["kernel_layout"] = "OIHW%do" % (cfg["tile_co"].size[-1])

         # Store the same config for the altered operator (workload)
         new_data = data
         new_kernel = te.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype)
         new_workload = autotvm.task.args_to_workload(
             [new_data, new_kernel, strides, padding, dilation, out_dtype],
             "depthwise_conv2d_nchw_spatial_pack.arm_cpu",
         )
         dispatch_ctx.update(target, new_workload, cfg)

         return relay.nn.conv2d(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_NCHWc.x86":
         # Converting NCHW to NCHWc.
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         if cfg.is_fallback:
             _get_x86_default_config(
                 cfg,
                 data_tensor,
                 kernel_tensor,
                 strides,
                 padding,
                 dilation,
                 out_dtype,
                 False,
                 data_layout,
             )
         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
         out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

         # update new attrs
         new_attrs["channels"] = out_channel
         new_attrs["data_layout"] = "NCHW%dc" % ic_bn
         # (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
         new_attrs["kernel_layout"] = "OIHW%di%do" % (ic_bn, oc_bn)
         new_attrs["out_layout"] = "NCHW%dc" % oc_bn

         # Store altered operator's config
         new_data = te.placeholder(
             (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
         )
         new_kernel = te.placeholder(
             (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn),
             dtype=kernel_tensor.dtype,
         )
         new_workload = autotvm.task.args_to_workload(
             [
                 new_data,
                 new_kernel,
                 strides,
                 padding,
                 dilation,
                 new_attrs["data_layout"],
                 new_attrs["out_layout"],
                 out_dtype,
             ],
             topi_tmpl,
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)

     if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
         # Converting NCHW to NCHWc.
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         if cfg.is_fallback:
             _get_x86_default_config(
                 cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
         out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
         assert channel_multiplier == 1

         # update new attrs
         new_attrs["channels"] = out_channel
         new_attrs["data_layout"] = "NCHW%dc" % ic_bn
         new_attrs["kernel_layout"] = "OIHW1i%do" % oc_bn
         new_attrs["out_layout"] = "NCHW%dc" % oc_bn

         # Store altered operator's config.
         new_data = te.placeholder(
             (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
         )
         new_kernel = te.placeholder((out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype)
         new_workload = autotvm.task.args_to_workload(
             [
                 new_data,
                 new_kernel,
                 strides,
                 padding,
                 dilation,
                 new_attrs["data_layout"],
                 new_attrs["out_layout"],
                 out_dtype,
             ],
             topi_tmpl,
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"
         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
         out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)

         n_elems = 4

         if cfg.is_fallback:
             _get_default_config_int8(
                 cfg,
                 data_tensor,
                 kernel_tensor,
                 strides,
                 padding,
                 dilation,
                 out_dtype,
                 False,
                 data_layout,
                 int32_lanes=4,
             )

         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

         if cfg.is_fallback:
             # ic_bn needs to be divided by n_elems below
             ic_bn = max(ic_bn, n_elems)

         # update new attrs
         new_attrs["channels"] = out_channel
         new_attrs["data_layout"] = "NCHW%dc" % ic_bn
         new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)
         new_attrs["out_layout"] = "NCHW%dc" % oc_bn

         # Store altered operator's config.
         new_data = te.placeholder(
             (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
         )
         new_kernel = te.placeholder(
             (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
             dtype=kernel_dtype,
         )
         new_workload = autotvm.task.args_to_workload(
             [
                 new_data,
                 new_kernel,
                 strides,
                 padding,
                 dilation,
                 new_attrs["data_layout"],
                 new_attrs["out_layout"],
                 out_dtype,
             ],
             topi_tmpl,
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
         # TODO(masahi): This schedule can easily result in a tensorization error
         # if used in the fallback mode
         if cfg.is_fallback:  # if is fallback, clear query cache and return None
             autotvm.task.clear_fallback_cache(target, workload)
             return None

         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
         new_kernel, new_kernel_expr = interleave_transpose_weights(
             inputs, data, kernel, interleave_A=True
         )
         new_workload = autotvm.task.args_to_workload(
             [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
             new_workload_name,
         )
         dispatch_ctx.update(target, new_workload, cfg)

         return relay.nn.contrib_conv2d_gemm_without_weight_transform(
             inputs[0], new_kernel_expr, **new_attrs
         )
     if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu":
         # TODO(masahi): This schedule can easily result in a tensorization error
         # if used in the fallback mode
         if cfg.is_fallback:  # if is fallback, clear query cache and return None
             autotvm.task.clear_fallback_cache(target, workload)
             return None

         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu"
         new_kernel, new_kernel_expr = interleave_transpose_weights(
             inputs, data, kernel, interleave_A=False
         )
         new_workload = autotvm.task.args_to_workload(
             [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
             new_workload_name,
         )
         dispatch_ctx.update(target, new_workload, cfg)
         return relay.nn.contrib_conv2d_gemm_without_weight_transform(
             inputs[0], new_kernel_expr, **new_attrs
         )
     return None


 @conv2d_legalize.register("arm_cpu")
 def _conv2d_legalize(attrs, inputs, arg_types):
     """Legalizes Conv2D op.

     Parameters
     ----------
     attrs : tvm.ir.Attrs
         Attributes of current convolution
     inputs : list of tvm.relay.Expr
         The args of the Relay expr to be legalized
     types : list of types
         List of input and output types

     Returns
     -------
     result : tvm.relay.Expr
         The legalized expr
     """
     # Collect the input tensors.
     data_tensor, kernel_tensor = arg_types[0], arg_types[1]
     data_dtype = data_tensor.dtype
     kernel_dtype = kernel_tensor.dtype

     # Collect the output tensor.
     output_tensor = arg_types[2]

     # Collect the input exprs.
     data, kernel = inputs

     # ARM vector instructions operate on the same dtype for data and kernel, we
     # provide those here and conv2d_alter_int8_common will convert to the
     # correct datatype.
     if is_int8_hw_support(kernel_dtype, kernel_dtype):
         # ARM intrinsics need the datatypes of data and kernel to be the same
         return conv2d_alter_int8_common(
             data, data_tensor, kernel, kernel_tensor, output_tensor, attrs, kernel_dtype, 8, 8
         )
     return None
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	# pylint: disable=invalid-name,unused-variable,unused-argument,no-member
	"""Conv2D alter op and legalize functions for arm cpu"""

	import logging

	import tvm
	from tvm import te
	from tvm import relay
	from tvm import autotvm

	from ..nn import conv2d_alter_layout, conv2d_legalize
	from ..utils import get_const_tuple
	from ..x86.conv2d import _get_default_config as _get_x86_default_config
	from ..x86.conv2d_int8 import _get_default_config_int8
	from .conv2d_int8 import is_int8_hw_support
	from .arm_utils import get_tiling_B_interleaved_t
	from ..generic.conv2d import conv2d_alter_int8_common

	logger = logging.getLogger("topi")


	def interleave_transpose_weights(inputs, data, kernel, interleave_A):
	"""Transform the weight matrix by reshaping, interleaving and transposing it

	Parameters
	----------
	inputs : tvm.relay.Expr
	Grouped input symbols
	data :
	Input shape and dtype
	kernel :
	Input shape and dtype
	interleave_A: indicates if we expect matrix A to be interleaved

	Returns
	----------
	new_kernel : tvm.te.placeholder
	A placeholder with the new shape
	new_kernel_expr : tvm.relay.Expr
	The relay expression of the weights
	"""
	assert (
	data.dtype == "int8"
	and kernel.dtype == "int8"
	or data.dtype == "uint8"
	and kernel.dtype == "uint8"
	)

	KH, KW, IC, OC = get_const_tuple(kernel.shape)
	K = KH * KW * IC
	N = OC

	# Get tiling information for the interleaved transposed version of B
	tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A)

	pad_K = 0
	pad_N = 0

	if N % tile_rows_B != 0:
	pad_N = tile_rows_B - (N % tile_rows_B)
	if K % tile_cols_B != 0:
	pad_K = tile_cols_B - (K % tile_cols_B)

	N_padded = N + pad_N
	K_padded = K + pad_K
	new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(
	inputs[1], tile_rows_B, tile_cols_B
	)
	new_kernel = te.placeholder(
	(N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype
	)
	return new_kernel, new_kernel_expr


	@conv2d_alter_layout.register(["arm_cpu"])
	def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
	target = tvm.target.Target.current(allow_none=False)
	dispatch_ctx = autotvm.task.DispatchContext.current

	_, outs = relay.backend.te_compiler.select_implementation(
	relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
	)
	workload = autotvm.task.get_workload(outs)
	if workload is None:
	# The best implementation is not an AutoTVM template,
	# we then assume it's not necessary to alter this op.
	return None
	cfg = dispatch_ctx.query(target, workload)

	topi_tmpl = workload[0]
	new_attrs = {k: attrs[k] for k in attrs.keys()}

	strides = attrs.get_int_tuple("strides")
	padding = attrs.get_int_tuple("padding")
	dilation = attrs.get_int_tuple("dilation")
	data_layout = attrs["data_layout"]
	kernel_layout = attrs["kernel_layout"]
	data, kernel = tinfos
	out_dtype = out_type.dtype

	# Extract data types
	data_tensor, kernel_tensor = tinfos
	data_dtype = data_tensor.dtype
	kernel_dtype = kernel_tensor.dtype

	idxd = tvm.tir.indexdiv

	# We don't perform layout alteration for NHWC layout with real data types
	if data_layout == "NHWC" and data_dtype not in ["uint8", "int8"]:
	return None

	if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu":
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	N, CI, H, W = get_const_tuple(data.shape)
	CO, _, KH, KW = get_const_tuple(kernel.shape)
	VC = cfg["tile_co"].size[-1]

	new_attrs["kernel_layout"] = "OIHW%do" % VC

	new_data = data
	new_kernel = te.placeholder((idxd(CO, VC), CI, KH, KW, VC), dtype=kernel.dtype)
	new_workload = autotvm.task.args_to_workload(
	[new_data, new_kernel, strides, padding, dilation, out_dtype],
	"conv2d_nchw_spatial_pack.arm_cpu",
	)
	dispatch_ctx.update(target, new_workload, cfg)

	return relay.nn.conv2d(inputs, *new_attrs)

	if topi_tmpl == "conv2d_nhwc_spatial_pack.arm_cpu":
	assert (
	data.dtype == "int8"
	and kernel.dtype == "int8"
	or data.dtype == "uint8"
	and kernel.dtype == "uint8"
	)

	assert data_layout == "NHWC" and kernel_layout == "HWIO"

	data_expr, kernel_expr = inputs

	data_int16 = relay.cast(data_expr, dtype="int16")
	kernel_int16 = relay.cast(kernel_expr, dtype="int16")

	new_attrs = {k: attrs[k] for k in attrs.keys()}

	new_data = te.placeholder(data.shape, "int16")
	new_kernel = te.placeholder(kernel.shape, "int16")

	new_workload = autotvm.task.args_to_workload(
	[new_data, new_kernel, strides, padding, dilation, out_dtype],
	"conv2d_nhwc_spatial_pack.arm_cpu",
	)
	dispatch_ctx.update(target, new_workload, cfg)

	return relay.nn.conv2d(data_int16, kernel_int16, **new_attrs)

	if topi_tmpl == "conv2d_nchw_winograd.arm_cpu":
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	N, CI, H, W = get_const_tuple(data.shape)
	CO, _, KH, KW = get_const_tuple(kernel.shape)
	VC = cfg["tile_k"].size[-1]
	tile_size = 4

	weight_expr = inputs[1]
	weight_expr = relay.nn.contrib_conv2d_winograd_weight_transform(
	weight_expr, tile_size=tile_size
	)
	weight_expr = relay.reshape(
	weight_expr, newshape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI)
	)
	weight_expr = relay.transpose(weight_expr, axes=[0, 1, 2, 4, 3])

	new_attrs["tile_size"] = tile_size
	new_attrs["channels"] = CO

	new_data = data
	new_kernel = te.placeholder(
	(KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC), kernel.dtype
	)
	new_workload = autotvm.task.args_to_workload(
	[new_data, new_kernel, strides, padding, dilation, out_dtype],
	"conv2d_nchw_winograd.arm_cpu",
	)
	dispatch_ctx.update(target, new_workload, cfg)

	return relay.nn.contrib_conv2d_winograd_without_weight_transform(
	inputs[0], weight_expr, **new_attrs
	)

	if topi_tmpl == "conv2d_nchw_winograd_nnpack.arm_cpu":
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	N, CI, H, W = get_const_tuple(data.shape)
	CO, _, KH, KW = get_const_tuple(kernel.shape)
	new_attrs["channels"] = CO

	# pre-compute winograd_nnpack transform
	# for winograd_nnpack_fp16, the precompute prune pass must run on device,
	# where float16 is supported
	weight_dtype = "float32"
	weight_expr = inputs[1]
	transformed_weight = relay.nn.contrib_conv2d_winograd_nnpack_weight_transform(
	weight_expr,
	convolution_algorithm=cfg["winograd_nnpack_algorithm"].val,
	out_dtype=weight_dtype,
	)

	new_data = data
	new_kernel = te.placeholder((CO, CI, 8, 8), "float32")

	new_workload = autotvm.task.args_to_workload(
	[new_data, new_kernel, None, strides, padding, dilation, out_dtype],
	"conv2d_nchw_winograd_nnpack_without_weight_transform.arm_cpu",
	)
	dispatch_ctx.update(target, new_workload, cfg)
	return relay.nn.contrib_conv2d_winograd_without_weight_transform(
	inputs[0], transformed_weight, **new_attrs
	)

	if topi_tmpl == "depthwise_conv2d_nchw_spatial_pack.arm_cpu":
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	N, CI, H, W = get_const_tuple(data.shape)
	CO, M, KH, KW = get_const_tuple(kernel.shape)
	VC = cfg["tile_co"].size[-1]

	new_attrs["kernel_layout"] = "OIHW%do" % (cfg["tile_co"].size[-1])

	# Store the same config for the altered operator (workload)
	new_data = data
	new_kernel = te.placeholder((idxd(CO, VC), M, KH, KW, VC), dtype=kernel.dtype)
	new_workload = autotvm.task.args_to_workload(
	[new_data, new_kernel, strides, padding, dilation, out_dtype],
	"depthwise_conv2d_nchw_spatial_pack.arm_cpu",
	)
	dispatch_ctx.update(target, new_workload, cfg)

	return relay.nn.conv2d(inputs, *new_attrs)

	if topi_tmpl == "conv2d_NCHWc.x86":
	# Converting NCHW to NCHWc.
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	if cfg.is_fallback:
	_get_x86_default_config(
	cfg,
	data_tensor,
	kernel_tensor,
	strides,
	padding,
	dilation,
	out_dtype,
	False,
	data_layout,
	)
	batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
	out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
	ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

	# update new attrs
	new_attrs["channels"] = out_channel
	new_attrs["data_layout"] = "NCHW%dc" % ic_bn
	# (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
	new_attrs["kernel_layout"] = "OIHW%di%do" % (ic_bn, oc_bn)
	new_attrs["out_layout"] = "NCHW%dc" % oc_bn

	# Store altered operator's config
	new_data = te.placeholder(
	(batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
	)
	new_kernel = te.placeholder(
	(out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn, oc_bn),
	dtype=kernel_tensor.dtype,
	)
	new_workload = autotvm.task.args_to_workload(
	[
	new_data,
	new_kernel,
	strides,
	padding,
	dilation,
	new_attrs["data_layout"],
	new_attrs["out_layout"],
	out_dtype,
	],
	topi_tmpl,
	)
	dispatch_ctx.update(target, new_workload, cfg)
	return relay.nn.contrib_conv2d_nchwc(inputs, *new_attrs)

	if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
	# Converting NCHW to NCHWc.
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	if cfg.is_fallback:
	_get_x86_default_config(
	cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout
	)

	batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
	out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
	ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
	assert channel_multiplier == 1

	# update new attrs
	new_attrs["channels"] = out_channel
	new_attrs["data_layout"] = "NCHW%dc" % ic_bn
	new_attrs["kernel_layout"] = "OIHW1i%do" % oc_bn
	new_attrs["out_layout"] = "NCHW%dc" % oc_bn

	# Store altered operator's config.
	new_data = te.placeholder(
	(batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
	)
	new_kernel = te.placeholder((out_channel // oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype)
	new_workload = autotvm.task.args_to_workload(
	[
	new_data,
	new_kernel,
	strides,
	padding,
	dilation,
	new_attrs["data_layout"],
	new_attrs["out_layout"],
	out_dtype,
	],
	topi_tmpl,
	)
	dispatch_ctx.update(target, new_workload, cfg)
	return relay.nn.contrib_depthwise_conv2d_nchwc(inputs, *new_attrs)

	if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
	assert data_layout == "NCHW" and kernel_layout == "OIHW"
	batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
	out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)

	n_elems = 4

	if cfg.is_fallback:
	_get_default_config_int8(
	cfg,
	data_tensor,
	kernel_tensor,
	strides,
	padding,
	dilation,
	out_dtype,
	False,
	data_layout,
	int32_lanes=4,
	)

	ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

	if cfg.is_fallback:
	# ic_bn needs to be divided by n_elems below
	ic_bn = max(ic_bn, n_elems)

	# update new attrs
	new_attrs["channels"] = out_channel
	new_attrs["data_layout"] = "NCHW%dc" % ic_bn
	new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)
	new_attrs["out_layout"] = "NCHW%dc" % oc_bn

	# Store altered operator's config.
	new_data = te.placeholder(
	(batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
	)
	new_kernel = te.placeholder(
	(out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
	dtype=kernel_dtype,
	)
	new_workload = autotvm.task.args_to_workload(
	[
	new_data,
	new_kernel,
	strides,
	padding,
	dilation,
	new_attrs["data_layout"],
	new_attrs["out_layout"],
	out_dtype,
	],
	topi_tmpl,
	)
	dispatch_ctx.update(target, new_workload, cfg)
	return relay.nn.contrib_conv2d_nchwc(inputs, *new_attrs)

	if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
	# TODO(masahi): This schedule can easily result in a tensorization error
	# if used in the fallback mode
	if cfg.is_fallback: # if is fallback, clear query cache and return None
	autotvm.task.clear_fallback_cache(target, workload)
	return None

	assert data_layout == "NHWC" and kernel_layout == "HWIO"
	KH, KW, _, OC = get_const_tuple(kernel.shape)
	new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
	new_kernel, new_kernel_expr = interleave_transpose_weights(
	inputs, data, kernel, interleave_A=True
	)
	new_workload = autotvm.task.args_to_workload(
	[data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
	new_workload_name,
	)
	dispatch_ctx.update(target, new_workload, cfg)

	return relay.nn.contrib_conv2d_gemm_without_weight_transform(
	inputs[0], new_kernel_expr, **new_attrs
	)
	if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu":
	# TODO(masahi): This schedule can easily result in a tensorization error
	# if used in the fallback mode
	if cfg.is_fallback: # if is fallback, clear query cache and return None
	autotvm.task.clear_fallback_cache(target, workload)
	return None

	assert data_layout == "NHWC" and kernel_layout == "HWIO"
	KH, KW, _, OC = get_const_tuple(kernel.shape)
	new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu"
	new_kernel, new_kernel_expr = interleave_transpose_weights(
	inputs, data, kernel, interleave_A=False
	)
	new_workload = autotvm.task.args_to_workload(
	[data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC],
	new_workload_name,
	)
	dispatch_ctx.update(target, new_workload, cfg)
	return relay.nn.contrib_conv2d_gemm_without_weight_transform(
	inputs[0], new_kernel_expr, **new_attrs
	)
	return None


	@conv2d_legalize.register("arm_cpu")
	def _conv2d_legalize(attrs, inputs, arg_types):
	"""Legalizes Conv2D op.

	Parameters
	----------
	attrs : tvm.ir.Attrs
	Attributes of current convolution
	inputs : list of tvm.relay.Expr
	The args of the Relay expr to be legalized
	types : list of types
	List of input and output types

	Returns
	-------
	result : tvm.relay.Expr
	The legalized expr
	"""
	# Collect the input tensors.
	data_tensor, kernel_tensor = arg_types[0], arg_types[1]
	data_dtype = data_tensor.dtype
	kernel_dtype = kernel_tensor.dtype

	# Collect the output tensor.
	output_tensor = arg_types[2]

	# Collect the input exprs.
	data, kernel = inputs

	# ARM vector instructions operate on the same dtype for data and kernel, we
	# provide those here and conv2d_alter_int8_common will convert to the
	# correct datatype.
	if is_int8_hw_support(kernel_dtype, kernel_dtype):
	# ARM intrinsics need the datatypes of data and kernel to be the same
	return conv2d_alter_int8_common(
	data, data_tensor, kernel, kernel_tensor, output_tensor, attrs, kernel_dtype, 8, 8
	)
	return None