| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # pylint: disable=invalid-name, unused-variable, too-many-locals |
| # pylint: disable=unused-argument, redefined-builtin |
| """Conv2D operators""" |
| from __future__ import absolute_import as _abs |
| from collections import namedtuple |
| import tvm |
| from tvm import te |
| |
| from .pad import pad |
| from .util import get_pad_tuple |
| from ..util import simplify, get_const_tuple, get_const_int, tag |
| from .winograd_util import winograd_transform_matrices |
| |
| # workload description of conv2d |
| Workload = namedtuple( |
| "Workload", |
| [ |
| "in_dtype", |
| "out_dtype", |
| "height", |
| "width", |
| "in_filter", |
| "groups", |
| "out_filter", |
| "hkernel", |
| "wkernel", |
| "hpad", |
| "wpad", |
| "hstride", |
| "wstride", |
| ], |
| ) |
| |
| |
| def conv2d(input, filter, strides, padding, dilation, layout="NCHW", out_dtype=None): |
| """Conv2D operator. |
| |
| Parameters |
| ---------- |
| input : tvm.te.Tensor |
| 4-D with shape [batch, in_channel, in_height, in_width] |
| |
| filter : tvm.te.Tensor |
| 4-D with shape [num_filter, in_channel, filter_height, filter_width] |
| |
| strides : int or a list/tuple of two ints |
| stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
    layout : str
        layout of data

    out_dtype : str, optional
        output data type; defaults to the input dtype
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 4-D with shape [batch, out_channel, out_height, out_width] |
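
    Examples
    --------
    A minimal sketch with assumed placeholder shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((1, 3, 224, 224), name="data")
        weight = te.placeholder((64, 3, 7, 7), name="weight")
        # stride 2, pad 3, dilation 1: (224 - 7 + 3 + 3) // 2 + 1 = 112
        out = conv2d(data, weight, strides=2, padding=3, dilation=1, layout="NCHW")
        # out.shape == (1, 64, 112, 112)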
| """ |
    # Platform-specific declarations take precedence when registered;
    # the layout-dispatched implementations below are the defaults.
| if layout == "NCHW": |
| return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) |
| if layout == "HWCN": |
| return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype) |
| if layout == "NHWC": |
| return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype) |
| raise ValueError("not support this layout {} yet".format(layout)) |
| |
| |
| @tvm.target.generic_func |
| def conv2d_legalize(attrs, inputs, types): |
| """Legalizes Conv2D op. |
| |
| Parameters |
| ---------- |
| attrs : tvm.ir.Attrs |
| Attributes of current convolution |
| inputs : list of tvm.relay.Expr |
| The args of the Relay expr to be legalized |
| types : list of types |
| List of input and output types |
| |
| Returns |
| ------- |
| result : tvm.relay.Expr |
| The legalized expr |
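
    Examples
    --------
    A hypothetical override registration (the key and body are illustrative;
    returning None keeps the op unchanged):

    .. code-block:: python

        @conv2d_legalize.register("cpu")
        def _conv2d_legalize_cpu(attrs, inputs, types):
            # Rewrite and return a new relay.Expr here, or return None.
            return None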
| """ |
    # Returning None indicates no change is needed (the default).
    return None
| |
| |
| @tvm.target.generic_func |
| def conv2d_alter_layout(attrs, inputs, tinfos, out_type): |
| """Change Conv2D layout. |
| |
| Parameters |
| ---------- |
| attrs : tvm.ir.Attrs |
| Attributes of current convolution |
| inputs : tvm.relay.Expr |
| Grouped input symbols |
| tinfos : list |
| Input shape and dtype |
| out_type: type |
| The output type |
| |
| Note |
| ---- |
| Unlike other TOPI functions, this function operates on both graph level and operator level. |
| """ |
    # Returning None indicates no change is needed (the default).
    return None
| |
| |
| @tvm.target.generic_func |
| def conv2d_infer_layout(workload, cfg): |
| """Infer input/output shapes and layouts from a workload and cfg. |
| |
| Parameters |
| ---------- |
| workload : tuple |
| conv2d workload |
| |
| cfg : tuple |
| tvm.autotvm config |
| |
| Returns |
| ------- |
| Output : [tuple of tuple and str, tuple of tuple and str] |
| Input shapes and layouts, and output shapes and layouts |
| """ |
| raise ValueError("missing register for topi.nn.conv2d_infer_layout") |
| |
| |
| def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): |
| """ Get the workload structure. """ |
| if data_layout == "NCHW": |
| _, CI, IH, IW = get_const_tuple(data.shape) |
| elif data_layout == "NHWC": |
| _, IH, IW, CI = get_const_tuple(data.shape) |
| elif data_layout == "HWCN": |
| IH, IW, CI, _ = get_const_tuple(data.shape) |
| else: |
| raise ValueError("not support this layout {} yet".format(data_layout)) |
| |
| if data_layout == "NCHW": |
| CO, CIG, KH, KW = get_const_tuple(kernel.shape) |
| else: |
| KH, KW, CIG, CO = get_const_tuple(kernel.shape) |
| |
| HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) |
| GRPS = CI // CIG |
| if isinstance(stride, (tuple, list)): |
| HSTR, WSTR = stride |
| else: |
| HSTR, WSTR = stride, stride |
    assert (data.dtype == kernel.dtype) or (
        data.dtype == "uint8" and kernel.dtype == "int8"
    ), "Inputs with different data types are not supported yet: {} vs. {}".format(
        data.dtype, kernel.dtype
    )
| return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) |
| |
| |
| def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): |
| """Convolution operator in NCHW layout. |
| |
| Parameters |
| ---------- |
| Input : tvm.te.Tensor |
| 4-D with shape [batch, in_channel, in_height, in_width] |
| |
| Filter : tvm.te.Tensor |
| 4-D with shape [num_filter, in_channel, filter_height, filter_width] |
| |
| stride : int or a list/tuple of two ints |
| Stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| Returns |
| ------- |
| Output : tvm.te.Tensor |
| 4-D with shape [batch, out_channel, out_height, out_width] |
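
    Examples
    --------
    A minimal sketch with assumed placeholder shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((1, 16, 32, 32), name="data")
        weight = te.placeholder((32, 16, 3, 3), name="weight")
        # pad 1, stride 1, dilation 1: (32 - 3 + 1 + 1) // 1 + 1 = 32
        out = conv2d_nchw(data, weight, 1, 1, 1)
        # out.shape == (1, 32, 32, 32)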
| """ |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_channel, in_height, in_width = Input.shape |
| num_filter, channel, kernel_h, kernel_w = Filter.shape |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| out_channel = num_filter |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| # compute graph |
| pad_before = [0, 0, pad_top, pad_left] |
| pad_after = [0, 0, pad_down, pad_right] |
| temp = pad(Input, pad_before, pad_after, name="pad_temp") |
| rc = te.reduce_axis((0, in_channel), name="rc") |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| return te.compute( |
| (batch, out_channel, out_height, out_width), |
| lambda nn, ff, yy, xx: te.sum( |
| temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype( |
| out_dtype |
| ) |
| * Filter[ff, rc, ry, rx].astype(out_dtype), |
| axis=[rc, ry, rx], |
| ), |
| tag="conv2d_nchw", |
| ) |
| |
| |
| def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None): |
| """Convolution operator in HWCN layout. |
| |
| Parameters |
| ---------- |
| Input : tvm.te.Tensor |
| 4-D with shape [in_height, in_width, in_channel, batch] |
| |
| Filter : tvm.te.Tensor |
| 4-D with shape [filter_height, filter_width, in_channel, num_filter] |
| |
| stride : int or a list/tuple of two ints |
| Stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 4-D with shape [out_height, out_width, out_channel, batch] |
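
    Examples
    --------
    A minimal sketch with assumed placeholder shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((32, 32, 16, 1), name="data")     # HWCN
        weight = te.placeholder((3, 3, 16, 32), name="weight")  # HWIO
        out = conv2d_hwcn(data, weight, 1, 1, 1)
        # out.shape == (32, 32, 32, 1)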
| """ |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| in_height, in_width, in_channel, batch = Input.shape |
| kernel_h, kernel_w, channel, num_filter = Filter.shape |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| out_channel = num_filter |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| pad_before = [pad_top, pad_left, 0, 0] |
| pad_after = [pad_down, pad_right, 0, 0] |
| PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") |
| rc = te.reduce_axis((0, in_channel), name="rc") |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| Output = te.compute( |
| (out_height, out_width, out_channel, batch), |
| lambda yy, xx, ff, nn: te.sum( |
| PaddedInput[ |
| yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc, nn |
| ].astype(out_dtype) |
| * Filter[ry, rx, rc, ff].astype(out_dtype), |
| axis=[ry, rx, rc], |
| ), |
| name="Conv2dOutput", |
| tag="conv2d_hwcn", |
| ) |
| return Output |
| |
| |
| def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype="float32"): |
| """Convolution operator in NHWC layout. |
| |
| Parameters |
| ---------- |
| Input : tvm.te.Tensor |
| 4-D with shape [batch, in_height, in_width, in_channel] |
| |
| Filter : tvm.te.Tensor |
| 4-D with shape [filter_height, filter_width, in_channel, num_filter] |
| |
| stride : int or a list/tuple of two ints |
| Stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 4-D with shape [batch, out_height, out_width, out_channel] |
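
    Examples
    --------
    A minimal sketch with assumed placeholder shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((1, 32, 32, 16), name="data")     # NHWC
        weight = te.placeholder((3, 3, 16, 32), name="weight")  # HWIO
        out = conv2d_nhwc(data, weight, 1, 1, 1)
        # out.shape == (1, 32, 32, 32)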
| """ |
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
| |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_height, in_width, in_channel = Input.shape |
| kernel_h, kernel_w, channel, num_filter = Filter.shape |
| # compute the output shape |
| dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| out_channel = num_filter |
| out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) |
| out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) |
| pad_before = [0, pad_top, pad_left, 0] |
| pad_after = [0, pad_down, pad_right, 0] |
| PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") |
| rc = te.reduce_axis((0, in_channel), name="rc") |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| Output = te.compute( |
| (batch, out_height, out_width, out_channel), |
| lambda nn, yy, xx, ff: te.sum( |
| PaddedInput[ |
| nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc |
| ].astype(out_dtype) |
| * Filter[ry, rx, rc, ff].astype(out_dtype), |
| axis=[ry, rx, rc], |
| ), |
| name="Conv2dOutput", |
| tag="conv2d_nhwc", |
| ) |
| return Output |
| |
| |
| def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="float32"): |
| """Conv2D operator for nChw[x]c layout. |
| |
| Parameters |
| ---------- |
| data : tvm.te.Tensor |
| 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] |
| |
| kernel : tvm.te.Tensor |
| 6-D with shape |
| [num_filter_chunk, in_channel_chunk, filter_height, filter_width, |
| in_channel_block, num_filter_block] |
| |
| stride : int or a list/tuple of two ints |
| stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| layout : str |
| Input data layout |
| |
| out_layout : str |
| Output data layout |
| |
| out_dtype : str |
| output data type |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] |
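
    Examples
    --------
    A minimal sketch with assumed shapes. Since the implementation calls
    tvm.target.Target.current(allow_none=False), it must run inside a
    target scope (an llvm target is assumed here):

    .. code-block:: python

        # NCHW4c data: 16 input channels packed as 4 chunks of 4
        data = te.placeholder((1, 4, 32, 32, 4), name="data")
        # OIHW4i8o kernel: 32 filters packed as 4 chunks of 8
        kernel = te.placeholder((4, 4, 3, 3, 4, 8), name="kernel")
        with tvm.target.Target("llvm"):
            out = conv2d_NCHWc(data, kernel, 1, 1, 1, "NCHW4c", "NCHW8c")
        # out.shape == (1, 4, 32, 32, 8)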
| """ |
| |
    # layout and out_layout are not used here;
    # we keep them for debugging convenience when dumping autotvm workloads
| HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) |
| dilation_h, dilation_w = ( |
| dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) |
| ) |
| |
| n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) |
| in_channel = ic_chunk * ic_bn |
| target = tvm.target.Target.current(allow_none=False) |
| oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple(kernel.shape) |
| num_filter = oc_chunk * oc_bn |
| groups = ic_chunk // ic_chunk_group |
| |
| dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 |
| |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| HPAD = pad_top + pad_down |
| WPAD = pad_left + pad_right |
| |
| # output shape |
| out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1 |
| out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1 |
| oshape = (n, oc_chunk, out_height, out_width, oc_bn) |
| pad_before = (0, 0, pad_top, pad_left, 0) |
| pad_after = (0, 0, pad_down, pad_right, 0) |
| |
| # DOPAD |
| DOPAD = HPAD != 0 or WPAD != 0 |
| if DOPAD: |
| data_pad = pad(data, pad_before, pad_after, name="data_pad") |
| else: |
| data_pad = data |
| |
| ic = te.reduce_axis((0, in_channel), name="ic") |
| kh = te.reduce_axis((0, kernel_height), name="kh") |
| kw = te.reduce_axis((0, kernel_width), name="kw") |
| |
| idxdiv = tvm.tir.indexdiv |
| idxmod = tvm.tir.indexmod |
| |
| return te.compute( |
| oshape, |
| lambda n, oc_chunk, oh, ow, oc_block: te.sum( |
| data_pad[ |
| n, |
| idxdiv(ic, ic_bn), |
| oh * HSTR + kh * dilation_h, |
| ow * WSTR + kw * dilation_w, |
| idxmod(ic, ic_bn), |
| ].astype(out_dtype) |
            * kernel[oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(
                out_dtype
            ),
| axis=[ic, kh, kw], |
| ), |
| name="conv2d_NCHWc", |
| tag="conv2d_NCHWc", |
| ) |
| |
| |
| def conv2d_NCHWc_int8( |
| data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="int32" |
| ): |
| """Conv2D operator for nChw[x]c layout. |
| |
| Parameters |
| ---------- |
| data : tvm.te.Tensor |
| 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] |
| |
| kernel : tvm.te.Tensor |
| 7-D with shape |
| [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4, |
| num_filter_block, 4] |
| |
| stride : int or a list/tuple of two ints |
| stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation: int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| layout : str |
| Input data layout |
| |
| out_layout : str |
| Output data layout |
| |
| out_dtype : str |
| output data type |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] |
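
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        # NCHW4c uint8 data: 16 input channels packed as 4 chunks of 4
        data = te.placeholder((1, 4, 32, 32, 4), dtype="uint8", name="data")
        # int8 kernel: 64 filters packed as 4 chunks of 16, with the 4-way
        # inner reduction packed into the last axis
        kernel = te.placeholder((4, 4, 3, 3, 1, 16, 4), dtype="int8", name="kernel")
        out = conv2d_NCHWc_int8(data, kernel, 1, 1, 1, "NCHW4c", "NCHW16c")
        # out.shape == (1, 4, 32, 32, 16)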
| """ |
| |
    # layout and out_layout are not used here;
    # we keep them for debugging convenience when dumping autotvm workloads
| HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) |
| dilation_h, dilation_w = ( |
| dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) |
| ) |
| |
| n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) |
| in_channel = ic_chunk * ic_bn |
| oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple( |
| kernel.shape |
| ) |
| num_filter = oc_chunk * oc_bn |
| groups = ic_chunk // ic_chunk_group |
| |
| dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 |
| dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 |
| |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple( |
| padding, (dilated_kernel_h, dilated_kernel_w) |
| ) |
| HPAD = pad_top + pad_down |
| WPAD = pad_left + pad_right |
| |
| # output shape |
| out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1 |
| out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1 |
| oshape = (n, oc_chunk, out_height, out_width, oc_bn) |
| pad_before = (0, 0, pad_top, pad_left, 0) |
| pad_after = (0, 0, pad_down, pad_right, 0) |
| |
| # DOPAD |
| DOPAD = HPAD != 0 or WPAD != 0 |
| if DOPAD: |
| data_pad = pad(data, pad_before, pad_after, name="data_pad") |
| else: |
| data_pad = data |
| |
| ic = te.reduce_axis((0, in_channel), name="ic") |
| kh = te.reduce_axis((0, kernel_height), name="kh") |
| kw = te.reduce_axis((0, kernel_width), name="kw") |
| |
| if groups == 1: |
| n_elems = 4 |
| ic_outer = te.reduce_axis((0, in_channel // ic_bn), name="ic_outer") |
| ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") |
| ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner") |
| return te.compute( |
| oshape, |
| lambda n, oc_chunk, oh, ow, oc_block: te.sum( |
| data_pad[ |
| n, |
| ic_outer, |
| oh * HSTR + kh * dilation_h, |
| ow * WSTR + kw * dilation_w, |
| ic_f_inner * n_elems + ic_s_inner, |
| ].astype(out_dtype) |
| * kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype( |
| out_dtype |
| ), |
| axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner], |
| ), |
| name="conv2d_NCHWc_int8", |
| tag="conv2d_NCHWc_int8", |
| ) |
| # for int8 group conv support |
| n_elems = 4 |
| ic_chunk = in_channel // ic_bn |
| ic_outer = te.reduce_axis((0, ic_chunk // groups), name="ic_outer") |
| ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") |
| ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner") |
| oshape = (n, oc_chunk, out_height, out_width, oc_bn) |
| return te.compute( |
| oshape, |
| lambda n, occ, oh, ow, oc_block: te.sum( |
| data_pad[ |
| n, |
| (occ * oc_bn // (oc_chunk * oc_bn // groups)) * (ic_chunk // groups) + ic_outer, |
                oh * HSTR + kh * dilation_h,
                ow * WSTR + kw * dilation_w,
| ic_f_inner * n_elems + ic_s_inner, |
| ].astype(out_dtype) |
| * kernel[occ, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), |
| axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner], |
| ), |
| name="conv2d_NCHWc_int8", |
| tag="conv2d_NCHWc_int8", |
| ) |
| |
| |
| def conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols): |
| """Weight transformation for winograd |
| |
| Parameters |
| ---------- |
    kernel: Tensor
        The raw kernel tensor with layout "HWIO".
| tile_rows: int |
| Tile rows of the weight transformation for ConvGemm. |
| tile_cols: int |
| Tile columns of the weight transformation for ConvGemm. |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
        4-D with shape [N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols]
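
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        kernel = te.placeholder((3, 3, 16, 32), name="kernel")  # HWIO
        # K = 3 * 3 * 16 = 144 and N = 32 are already multiples of the
        # tiles, so no padding is added
        packed = conv2d_gemm_weight_transform(kernel, tile_rows=4, tile_cols=16)
        # packed.shape == (32 // 4, 144 // 16, 4, 16) == (8, 9, 4, 16)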
| """ |
| KH, KW, IC, OC = get_const_tuple(kernel.shape) |
| K = KH * KW * IC |
| N = OC |
| |
| kernel_flat = te.compute( |
| (K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], "weight_flatten" |
| ) |
| |
| pad_K = 0 |
| pad_N = 0 |
| |
| if N % tile_rows != 0: |
| pad_N = tile_rows - (N % tile_rows) |
| |
| if K % tile_cols != 0: |
| pad_K = tile_cols - (K % tile_cols) |
| |
| N_padded = N + pad_N |
| K_padded = K + pad_K |
| |
| if pad_K != 0 or pad_N != 0: |
| kernel_flat = pad( |
| kernel_flat, pad_before=(0, 0), pad_after=(pad_K, pad_N), name="weight_padding" |
| ) |
| |
| return te.compute( |
| (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols), |
| lambda x, y, z, w: kernel_flat[w + tile_cols * y, z + tile_rows * x], |
| name="weight_block_reshape", |
| ) |
| |
| |
| def conv2d_winograd_weight_transform(kernel, tile_size): |
| """Weight transformation for winograd |
| |
| Parameters |
| ---------- |
    kernel: Tensor
        The raw kernel tensor with layout "OIHW".
| tile_size: int |
| Tile size of winograd transform. e.g. 2 for F(2x2, 3x3) and 4 for F(4x4, 3x3) |
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 4-D with shape [alpha, alpha, CO, CI] |
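
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        kernel = te.placeholder((64, 32, 3, 3), name="kernel")  # OIHW
        # F(4x4, 3x3): alpha = tile_size + K - 1 = 4 + 3 - 1 = 6
        U = conv2d_winograd_weight_transform(kernel, tile_size=4)
        # U.shape == (6, 6, 64, 32)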
| """ |
| shape = get_const_tuple(kernel.shape) |
    assert shape[2] == shape[3], "Only NxN kernels are supported"
| |
| K = shape[3] |
| r = tile_size + K - 1 |
| shape = (r, r) + shape[:2] |
| |
| _, _, G = winograd_transform_matrices(tile_size, K, kernel.dtype) |
| |
| r_kh = te.reduce_axis((0, K), name="r_kh") |
| r_kw = te.reduce_axis((0, K), name="r_kw") |
| return te.compute( |
| shape, |
| lambda eps, nu, co, ci: te.sum( |
| kernel[co][ci][r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw] |
| ), |
| name="transform_weight", |
| ) |
| |
| |
| def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype): |
| """Weight transformation for winograd |
| |
| Parameters |
| ---------- |
    kernel: Tensor
        The raw kernel tensor with layout "OIHW". Only 3x3 kernels are supported for now.
    convolution_algorithm: int
        The convolution algorithm for Winograd NNPACK.
    out_dtype: str
        The output data type.
| |
| Returns |
| ------- |
| output : tvm.te.Tensor |
| 4-D with shape [alpha, alpha, CO, CI] |
| """ |
| # pylint: disable=import-outside-toplevel |
| from tvm.contrib import nnpack |
| |
| return nnpack.convolution_inference_weight_transform( |
| kernel, algorithm=convolution_algorithm, dtype=out_dtype |
| ) |
| |
| |
| def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None): |
| """Group convolution operator in NCHW layout. |
| |
| Parameters |
| ---------- |
| Input : tvm.te.Tensor |
| 4-D with shape [batch, in_channel, in_height, in_width] |
| |
| Filter : tvm.te.Tensor |
| 4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] |
| |
| stride : int or a list/tuple of two ints |
| Stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation : int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| groups : int |
| number of groups |
| |
| out_dtype : str |
| The output type. This is used for mixed precision. |
| |
| Returns |
| ------- |
| Output : tvm.te.Tensor |
| 4-D with shape [batch, out_channel, out_height, out_width] |
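
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((1, 16, 32, 32), name="data")
        # groups=4: each filter sees 16 // 4 = 4 input channels
        weight = te.placeholder((32, 4, 3, 3), name="weight")
        out = group_conv2d_nchw(data, weight, 1, 1, 1, groups=4)
        # out.shape == (1, 32, 32, 32)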
| """ |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_channel, in_height, in_width = get_const_tuple(Input.shape) |
| num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape) |
| |
    assert in_channel % groups == 0, "input channels must be divisible by the number of groups"
    assert num_filter % groups == 0, "output channels must be divisible by the number of groups"
| |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) |
| # compute the output shape |
| out_channel = num_filter |
| out_height = simplify( |
| (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 |
| ) |
| out_width = simplify( |
| (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 |
| ) |
| # compute graph |
| pad_before = [0, 0, pad_top, pad_left] |
| pad_after = [0, 0, pad_down, pad_right] |
| temp = pad(Input, pad_before, pad_after, name="pad_temp") |
| rc = te.reduce_axis((0, in_channel // groups), name="rc") |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| return te.compute( |
| (batch, out_channel, out_height, out_width), |
| lambda nn, ff, yy, xx: te.sum( |
| temp[ |
| nn, |
| ff // (num_filter // groups) * (in_channel // groups) + rc, |
| yy * stride_h + ry * dilation_h, |
| xx * stride_w + rx * dilation_w, |
| ].astype(out_dtype) |
| * Filter[ff, rc, ry, rx].astype(out_dtype), |
| axis=[rc, ry, rx], |
| ), |
| tag="group_conv2d_nchw", |
| ) |
| |
| |
| def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtype=None): |
| """Group convolution operator in NHWC layout. |
| |
| Parameters |
| ---------- |
| Input : tvm.te.Tensor |
| 4-D with shape [batch, in_height, in_width, in_channel] |
| |
| Filter : tvm.te.Tensor |
| 4-D with shape [filter_height, filter_width, in_channel // groups, num_filter] |
| |
| stride : int or a list/tuple of two ints |
| Stride size, or [stride_height, stride_width] |
| |
| padding : int or a list/tuple of 2 or 4 ints |
| padding size, or |
| [pad_height, pad_width] for 2 ints, or |
| [pad_top, pad_left, pad_bottom, pad_right] for 4 ints |
| |
| dilation : int or a list/tuple of two ints |
| dilation size, or [dilation_height, dilation_width] |
| |
| groups : int |
| number of groups |
| |
| out_dtype : str |
| The output type. This is used for mixed precision. |
| |
| Returns |
| ------- |
| Output : tvm.te.Tensor |
| 4-D with shape [batch, out_height, out_width, out_channel] |
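
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        data = te.placeholder((1, 32, 32, 16), name="data")      # NHWC
        # groups=4: each filter sees 16 // 4 = 4 input channels
        weight = te.placeholder((3, 3, 4, 32), name="weight")    # HWIO
        out = group_conv2d_nhwc(data, weight, 1, 1, 1, groups=4)
        # out.shape == (1, 32, 32, 32)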
| """ |
| if out_dtype is None: |
| out_dtype = Input.dtype |
| assert isinstance(stride, int) or len(stride) == 2 |
| assert isinstance(dilation, int) or len(dilation) == 2 |
| if isinstance(stride, int): |
| stride_h = stride_w = stride |
| else: |
| stride_h, stride_w = stride |
| |
| if isinstance(dilation, int): |
| dilation_h = dilation_w = dilation |
| else: |
| dilation_h, dilation_w = dilation |
| |
| batch, in_height, in_width, in_channel = get_const_tuple(Input.shape) |
| kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape) |
| |
    assert in_channel % groups == 0, "input channels must be divisible by the number of groups"
    assert num_filter % groups == 0, "output channels must be divisible by the number of groups"
| |
| pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) |
| # compute the output shape |
| out_channel = num_filter |
| out_height = simplify( |
| (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 |
| ) |
| out_width = simplify( |
| (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 |
| ) |
| # compute graph |
| pad_before = [0, pad_top, pad_left, 0] |
| pad_after = [0, pad_down, pad_right, 0] |
| temp = pad(Input, pad_before, pad_after, name="pad_temp") |
| ry = te.reduce_axis((0, kernel_h), name="ry") |
| rx = te.reduce_axis((0, kernel_w), name="rx") |
| rc = te.reduce_axis((0, in_channel // groups), name="rc") |
| return te.compute( |
| (batch, out_height, out_width, out_channel), |
| lambda nn, yy, xx, ff: te.sum( |
| temp[ |
| nn, |
| yy * stride_h + ry * dilation_h, |
| xx * stride_w + rx * dilation_w, |
| ff // (num_filter // groups) * (in_channel // groups) + rc, |
| ].astype(out_dtype) |
| * Filter[ry, rx, rc, ff].astype(out_dtype), |
| axis=[ry, rx, rc], |
| ), |
| tag="group_conv2d_nhwc", |
| ) |
| |
| |
| def unpack_NCHWc_to_nchw(packed_out, out_dtype): |
| """Unpack conv2d_NCHWc output from layout NCHWc to NCHW |
| |
| Parameters |
| ---------- |
| packed_out : tvm.te.Tensor |
| The output tensor of conv2d_NCHWc. |
| |
| out_dtype : str |
| The output dtype. |
| |
| Returns |
| ------- |
| unpacked_out : tvm.te.Tensor |
| The unpacked output tensor in NCHW layout. |
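
    Examples
    --------
    A minimal sketch with assumed shapes (illustrative only):

    .. code-block:: python

        packed = te.placeholder((1, 4, 32, 32, 8), name="packed")  # NCHW8c
        unpacked = unpack_NCHWc_to_nchw(packed, "float32")
        # unpacked.shape == (1, 32, 32, 32)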
| """ |
| n, oc_chunk, oh, ow, oc_bn = get_const_tuple(packed_out.shape) |
| |
| idxmod = tvm.tir.indexmod |
| idxdiv = tvm.tir.indexdiv |
| |
| oshape = (n, oc_chunk * oc_bn, oh, ow) |
| unpacked_out = te.compute( |
| oshape, |
| lambda n, c, h, w: packed_out[n, idxdiv(c, oc_bn), h, w, idxmod(c, oc_bn)].astype( |
| out_dtype |
| ), |
| name="output_unpack", |
| tag=tag.INJECTIVE + ",unpack_nchwc", |
| ) |
| return unpacked_out |