# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, unused-variable, too-many-locals, unused-argument
"""Depthwise convolution operators"""
from __future__ import absolute_import as _abs
from collections import namedtuple
import tvm
import numpy as np
from tvm import te

from .dilate import dilate
from .pad import pad
from .utils import get_pad_tuple
from ..utils import simplify, get_const_tuple

# workload description of depthwise-conv2d
Workload = namedtuple(
    "Workload",
    [
        "in_dtype",
        "out_dtype",
        "height",
        "width",
        "in_filter",
        "out_filter",
        "kernel_h",
        "kernel_w",
        "padt",
        "padl",
        "padb",
        "padr",
        "dilation_h",
        "dilation_w",
        "stride_h",
        "stride_w",
    ],
)
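
# For reference, a Workload for a hypothetical 3x3 depthwise layer over a
# 32-channel 56x56 input with "SAME" padding, stride 1, and channel_multiplier 1
# would look like the following (illustrative values only, not built at import time):
#
#   Workload(in_dtype="float32", out_dtype="float32", height=56, width=56,
#            in_filter=32, out_filter=32, kernel_h=3, kernel_w=3,
#            padt=1, padl=1, padb=1, padr=1,
#            dilation_h=1, dilation_w=1, stride_h=1, stride_w=1)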


def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"):
    """Get the workload structure for a depthwise conv2d.

    Input data and filter must use the layout given by ``data_layout``,
    which may be "NCHW", "NHWC", or "NCHWc".
    """
    if data_layout == "NCHW":
        _, in_channel, height, width = get_const_tuple(data.shape)
        filter_channel, channel_multiplier, kh, kw = get_const_tuple(kernel.shape)
    elif data_layout == "NHWC":
        _, height, width, in_channel = get_const_tuple(data.shape)
        kh, kw, filter_channel, channel_multiplier = get_const_tuple(kernel.shape)
    elif data_layout == "NCHWc":
        _, in_channel_chunk, height, width, in_channel_block = get_const_tuple(data.shape)
        in_channel = in_channel_chunk * in_channel_block
        (
            filter_channel_chunk,
            cm_chunk,
            kh,
            kw,
            cm_block,
            filter_channel_block,
        ) = get_const_tuple(kernel.shape)
        filter_channel = filter_channel_chunk * filter_channel_block
        channel_multiplier = cm_chunk * cm_block

        assert (
            in_channel_block == filter_channel_block
        ), "Incorrect dimensions, data has block size {}, but filter has block size {}".format(
            in_channel_block, filter_channel_block
        )
    else:
        raise ValueError("Data layout {} not supported".format(data_layout))

    assert (
        in_channel == filter_channel
    ), "Incorrect dimensions, data has {} channels but filter expects {} channels".format(
        in_channel, filter_channel
    )

    out_channel = filter_channel * channel_multiplier
    dilation_h, dilation_w = (
        dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)
    )

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    assert (data.dtype == kernel.dtype) or (
        data.dtype == "uint8" and kernel.dtype == "int8"
    ), "Do not support inputs with different data types now. {} vs. {}".format(
        data.dtype, kernel.dtype
    )
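    # The uint8/int8 exception above covers quantized models, which commonly feed
    # uint8 activations into int8 weights; all other dtype mismatches are rejected.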
    dilated_kernel_h = (kh - 1) * dilation_h + 1
    dilated_kernel_w = (kw - 1) * dilation_w + 1
    pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w))
    return Workload(
        data.dtype,
        out_dtype,
        height,
        width,
        in_channel,
        out_channel,
        kh,
        kw,
        pt,
        pl,
        pb,
        pr,
        dilation_h,
        dilation_w,
        HSTR,
        WSTR,
    )
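

# A minimal usage sketch of _get_workload (hypothetical helper, not part of the
# original module): it only illustrates the expected tensor shapes for the default
# NCHW layout and is never called at import time; all values are illustrative.
def _example_get_workload():
    data = te.placeholder((1, 32, 56, 56), name="data", dtype="float32")
    kernel = te.placeholder((32, 1, 3, 3), name="kernel", dtype="float32")
    # 3x3 depthwise kernel with channel_multiplier 1, stride 1, "SAME" padding
    return _get_workload(
        data, kernel, stride=1, padding="SAME", dilation=1, out_dtype="float32"
    )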


def depthwise_conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
    """Depthwise convolution nchw forward operator.

    Parameters
    ----------
    Input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    Filter : tvm.te.Tensor
        4-D with shape [in_channel, channel_multiplier, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        The spatial stride, or (stride_height, stride_width).

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : str, optional
        Output data type

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = Input.shape
    # shape of dilated kernel
    filter_channel, channel_multiplier, filter_height, filter_width = Filter.shape
    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w)
    )
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
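    # Worked example (illustrative numbers): a 3x3 kernel with dilation 2 has a
    # dilated extent of (3 - 1) * 2 + 1 = 5; on a 56-wide input with pad 2 + 2 and
    # stride 1 the output width is (56 - 5 + 2 + 2) // 1 + 1 = 56.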
    # padding stage
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")

    # depthconv stage
    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod
    di = te.reduce_axis((0, filter_height), name="di")
    dj = te.reduce_axis((0, filter_width), name="dj")
    Output = te.compute(
        (batch, out_channel, out_height, out_width),
        lambda b, c, i, j: te.sum(
            (
                PaddedInput[
                    b,
                    idxdiv(c, channel_multiplier),
                    i * stride_h + di * dilation_h,
                    j * stride_w + dj * dilation_w,
                ].astype(out_dtype)
                * Filter[
                    idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier), di, dj
                ].astype(out_dtype)
            ),
            axis=[di, dj],
        ),
        name="DepthwiseConv2d",
        tag="depthwise_conv2d_nchw",
    )
    return Output
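

# A minimal usage sketch (hypothetical helper, not part of the original module):
# it declares a small NCHW depthwise conv, builds a default te schedule, and lowers
# it for inspection; shapes, dtypes, and the helper name are illustrative only.
def _example_depthwise_conv2d_nchw():
    data = te.placeholder((1, 32, 56, 56), name="data", dtype="float32")
    kernel = te.placeholder((32, 1, 3, 3), name="kernel", dtype="float32")
    out = depthwise_conv2d_nchw(data, kernel, stride=1, padding="SAME", dilation=1)
    # A default schedule is enough to print the loop nest; real backends use tuned schedules.
    sched = te.create_schedule(out.op)
    return tvm.lower(sched, [data, kernel, out], simple_mode=True)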


def depthwise_conv2d_nhwc(
    Input, Filter, stride, padding, dilation, kernel_layout="HWOI", out_dtype=None
):
    """Depthwise convolution nhwc forward operator.

    Parameters
    ----------
    Input : tvm.te.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Filter : tvm.te.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    kernel_layout : str, optional
        Filter layout, "HWOI" (default, matching the shape documented above) or
        "HWIO", which swaps the last two axes.

    out_dtype : str, optional
        Output data type

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """
    out_dtype = Input.dtype if out_dtype is None else out_dtype

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape

    # shape of dilated kernel
    if kernel_layout == "HWIO":
        filter_height, filter_width, channel_multiplier, filter_channel = Filter.shape
        kernel_permutation = [0, 1, 3, 2]
    else:
        filter_height, filter_width, filter_channel, channel_multiplier = Filter.shape
        kernel_permutation = [0, 1, 2, 3]

    dilated_kernel_h = (filter_height - 1) * dilation_h + 1
    dilated_kernel_w = (filter_width - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w)
    )
    out_channel = simplify(in_channel * channel_multiplier)
    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # padding stage
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")

    # depthconv stage
    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod
    di = te.reduce_axis((0, filter_height), name="di")
    dj = te.reduce_axis((0, filter_width), name="dj")
    Output = te.compute(
        (batch, out_height, out_width, out_channel),
        lambda b, i, j, c: te.sum(
            (
                PaddedInput[
                    b,
                    i * stride_h + di * dilation_h,
                    j * stride_w + dj * dilation_w,
                    idxdiv(c, channel_multiplier),
                ].astype(out_dtype)
                * Filter[
                    tuple(
                        np.array(
                            [di, dj, idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier)]
                        )[kernel_permutation]
                    )
                ].astype(out_dtype)
            ),
            axis=[di, dj],
        ),
        name="DepthwiseConv2d",
        tag="depthwise_conv2d_nhwc",
    )
    return Output
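

# A minimal usage sketch (hypothetical helper, not part of the original module):
# the same 3x3 depthwise layer expressed in NHWC with the default HWOI filter
# layout; shapes are illustrative and the helper is never called at import time.
def _example_depthwise_conv2d_nhwc():
    data = te.placeholder((1, 56, 56, 32), name="data", dtype="float32")
    kernel = te.placeholder((3, 3, 32, 1), name="kernel", dtype="float32")
    # stride and dilation may be ints or (h, w) pairs; padding may be an int or "SAME"/"VALID"
    return depthwise_conv2d_nhwc(data, kernel, stride=(1, 1), padding="SAME", dilation=(1, 1))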


def depthwise_conv2d_backward_input_nhwc(Filter, Out_grad, oshape, ishape, stride, padding):
    """Depthwise convolution nhwc backward wrt input operator.

    Parameters
    ----------
    Filter : tvm.te.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    Out_grad : tvm.te.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    oshape : tuple of four ints
        Shape of the forward output, [batch, out_height, out_width, out_channel]

    ishape : tuple of four ints
        Shape of the forward input, [batch, in_height, in_width, in_channel]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]
    """
    batch, in_h, in_w, in_c = ishape
    _, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = Filter.shape

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    dilated_out_grad = dilate(Out_grad, [1, stride_h, stride_w, 1], name="dilated_out_grad")

    # padding params in forward propagation
    fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple(padding, (filter_h, filter_w))
    # padding params in backward propagation
    bpad_top = filter_h - 1 - fpad_top
    bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1)
    bpad_left = filter_w - 1 - fpad_left
    bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1)
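    # Worked example (illustrative numbers): for a 3x3 filter, forward pad_top 1 and
    # stride 2, the dilated gradient is padded by bpad_top = 3 - 1 - 1 = 1 and
    # bpad_bottom = (3 - 1 - 1) + (2 - 1) = 2, which realigns it with the
    # transposed-convolution extent of the forward pass.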
    padded_out_grad = pad(
        dilated_out_grad,
        [0, bpad_top, bpad_left, 0],
        [0, bpad_bottom, bpad_right, 0],
        name="padded_out_grad",
    )

    dh = te.reduce_axis((0, filter_h), name="dh")
    dw = te.reduce_axis((0, filter_w), name="dw")
    dc = te.reduce_axis((0, channel_multiplier), name="dc")

    In_grad = te.compute(
        (batch, in_h, in_w, in_c),
        lambda b, h, w, c: te.sum(
            padded_out_grad[b, h + dh, w + dw, c * channel_multiplier + dc]
            * Filter[filter_h - 1 - dh, filter_w - 1 - dw, c, dc],
            axis=[dh, dw, dc],
        ),
        tag="depthwise_conv2d_backward_input_nhwc",
    )
    return In_grad
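

# A minimal usage sketch (hypothetical helper, not part of the original module):
# the gradient w.r.t. the input for the NHWC layer sketched earlier; oshape and
# ishape mirror the forward shapes, and all values are illustrative.
def _example_depthwise_conv2d_backward_input_nhwc():
    kernel = te.placeholder((3, 3, 32, 1), name="kernel", dtype="float32")
    out_grad = te.placeholder((1, 56, 56, 32), name="out_grad", dtype="float32")
    return depthwise_conv2d_backward_input_nhwc(
        kernel,
        out_grad,
        oshape=(1, 56, 56, 32),
        ishape=(1, 56, 56, 32),
        stride=(1, 1),
        padding="SAME",
    )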


def depthwise_conv2d_backward_weight_nhwc(Input, Out_grad, oshape, fshape, stride, padding):
    """Depthwise convolution nhwc backward wrt weight operator.

    Parameters
    ----------
    Input : tvm.te.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    Out_grad : tvm.te.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]

    oshape : tuple of four ints
        Shape of the forward output, [batch, out_height, out_width, out_channel]

    fshape : tuple of four ints
        Shape of the filter, [filter_height, filter_width, in_channel, channel_multiplier]

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]
    """
    batch, out_h, out_w, out_c = oshape
    filter_h, filter_w, _, channel_multiplier = fshape
    in_c = Input.shape[3].value

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (filter_h, filter_w))

    padded_in = pad(
        Input, [0, pad_top, pad_left, 0], [0, pad_bottom, pad_right, 0], name="padded_in"
    )

    dh = te.reduce_axis((0, Out_grad.shape[1].value), name="dh")
    dw = te.reduce_axis((0, Out_grad.shape[2].value), name="dw")
    db = te.reduce_axis((0, batch), name="db")
    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    Weight_grad = te.compute(
        (filter_h, filter_w, in_c, channel_multiplier),
        lambda fh, fw, c, m: te.sum(
            Out_grad[db, dh, dw, c * channel_multiplier + idxmod(m, channel_multiplier)]
            * padded_in[db, fh + dh * stride_h, fw + dw * stride_w, c],
            axis=[db, dh, dw],
        ),
        tag="depthwise_conv2d_backward_weight_nhwc",
    )
    return Weight_grad
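

# A minimal usage sketch (hypothetical helper, not part of the original module):
# the gradient w.r.t. the 3x3 HWOI filter for the same illustrative NHWC layer.
def _example_depthwise_conv2d_backward_weight_nhwc():
    data = te.placeholder((1, 56, 56, 32), name="data", dtype="float32")
    out_grad = te.placeholder((1, 56, 56, 32), name="out_grad", dtype="float32")
    return depthwise_conv2d_backward_weight_nhwc(
        data,
        out_grad,
        oshape=(1, 56, 56, 32),
        fshape=(3, 3, 32, 1),
        stride=(1, 1),
        padding="SAME",
    )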


def depthwise_conv2d_NCHWc(
    Input, Filter, stride, padding, dilation, layout, out_layout, out_dtype=None
):
    """Depthwise convolution NCHW[x]c forward operator.

    Parameters
    ----------
    Input : tvm.te.Tensor
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    Filter : tvm.te.Tensor
        6-D with shape [out_channel_chunk, 1, filter_height, filter_width, 1, out_channel_block]
        In NCHWc depthwise convolution, the kernel's in_channel and channel_multiplier
        are grouped together and then tiled.

    stride : tuple of two ints
        The spatial stride along height and width

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        Input data layout

    out_layout : str
        Output data layout

    out_dtype : str, optional
        Output data type

    Returns
    -------
    Output : tvm.te.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """
    raise ValueError("missing register for topi.nn.depthwise_conv2d_NCHWc")


@tvm.target.generic_func
def depthwise_conv2d_infer_layout(workload, cfg):
    """Infer input/output shapes and layouts from a workload and cfg.

    Parameters
    ----------
    workload : tuple
        conv2d workload

    cfg : tuple
        tvm.autotvm config

    Returns
    -------
    Output : [tuple of tuple and str, tuple of tuple and str]
        Input shapes and layouts, and output shapes and layouts
    """
    raise ValueError("missing register for topi.nn.depthwise_conv2d_infer_layout")