# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, unused-variable, too-many-locals
# pylint: disable=unused-argument, redefined-builtin
"""Generic convolution schedules"""
from tvm import te
from tvm import autotvm
from tvm import relay
from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity
from ..utils import get_const_tuple, traverse_inline
from ..nn.utils import get_pad_tuple
def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements):
"""Fallback schedule for conv2d int8 on cpu.
Normally the inner most pattern takes two int8/uint8 tensors
data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
produces a dot product int32/uint32 output[int32_lanes].
Parameters
----------
int32_lanes : int
How many numbers of int32/uint32 will be produced using intrinsic.
This is related to output channel.
num_int8_elements : int
How many numbers of input int32/uint32 will be multiplied and reduced.
This is related to input channel.
"""
pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
HSTR, WSTR = wkl.stride_h, wkl.stride_w
dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1
out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1
assert (
wkl.out_filter % int32_lanes == 0
), f"wkl.out_filter={wkl.out_filter}, int32_lanes={int32_lanes}"
assert (
wkl.in_filter % num_int8_elements == 0
), f"wkl.in_filter={wkl.in_filter}, num_int8_elements={num_int8_elements}"
oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
ic_bn = 1
for bn in range(oc_bn, 0, -4):
if wkl.in_filter % bn == 0:
ic_bn = bn
break
reg_n = 1
for n in range(31, 0, -1):
if out_width % n == 0:
reg_n = n
break
cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
cfg["tile_ow"] = SplitEntity([out_width // reg_n, reg_n])
cfg["unroll_kw"] = OtherOptionEntity(False)
def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements):
"""Fallback schedule for 1x1 conv2d int8 on cpu.
Normally the inner most pattern takes two int8/uint8 tensors
data[num_int8_elements] and kernel[int32_lanes, num_int8_elements],
produces a dot product int32/uint32 output[int32_lanes].
Parameters
----------
int32_lanes : int
How many numbers of int32/uint32 will be produced using intrinsic.
This is related to output channel.
num_int8_elements : int
How many numbers of input int32/uint32 will be multiplied and reduced.
This is related to input channel.
"""
pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr
HSTR, WSTR = wkl.stride_h, wkl.stride_w
out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1
out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1
assert (
wkl.out_filter % int32_lanes == 0
), f"wkl.out_filter={wkl.out_filter}, int32_lanes={int32_lanes}"
assert (
wkl.in_filter % num_int8_elements == 0
), f"wkl.in_filter={wkl.in_filter}, num_int8_elements={num_int8_elements}"
oc_bn = int32_lanes if int32_lanes >= num_int8_elements else num_int8_elements
ic_bn = 1
for bn in range(oc_bn, 0, -4):
if wkl.in_filter % bn == 0:
ic_bn = bn
break
for ow_factor in range(out_width, 0, -1):
if out_width % ow_factor == 0:
for oh_factor in range(out_height, 0, -1):
if out_height % oh_factor == 0 and ow_factor * oh_factor < 32:
cfg["tile_ic"] = SplitEntity([wkl.in_filter // ic_bn, ic_bn])
cfg["tile_oc"] = SplitEntity([wkl.out_filter // oc_bn, oc_bn])
cfg["tile_oh"] = OtherOptionEntity(oh_factor)
cfg["tile_ow"] = SplitEntity([out_width // ow_factor, ow_factor])
return
raise ValueError(f"cannot decide default schedule for workload: {wkl}")
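

# A similar illustrative, uncalled sketch (the workload type and values are
# assumptions for demonstration only) for the 1x1 fallback: here tile_oh and
# tile_ow are chosen jointly so the unrolled inner block stays under 32 pixels.
def _example_fallback_1x1_int8():
    from collections import namedtuple

    from tvm.autotvm.task.space import FallbackConfigEntity

    _FakeWorkload = namedtuple(
        "_FakeWorkload",
        [
            "height", "width", "in_filter", "out_filter",
            "kernel_h", "kernel_w",
            "padt", "padl", "padb", "padr",
            "stride_h", "stride_w",
        ],
    )
    wkl = _FakeWorkload(
        height=56, width=56, in_filter=64, out_filter=256,
        kernel_h=1, kernel_w=1,
        padt=0, padl=0, padb=0, padr=0,
        stride_h=1, stride_w=1,
    )
    cfg = FallbackConfigEntity()
    fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=16, num_int8_elements=4)
    # cfg["tile_oh"].val * cfg["tile_ow"].size[-1] < 32 by construction.
    return cfg

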
def schedule_conv_NCHWc_cpu_common_int8(
s,
cfg,
data_vec,
kernel_vec,
conv_out,
last,
int32_lanes=16,
int8_elems=4,
intrin=None,
inline_fused=True,
mem_scope="global",
):
"""
Defines the schedule for INT8 for Intel and ARM machines
Uses the Intel/ARM intrinsics to use INT8 operations
More details - https://software.intel.com/en-us/articles/
lower-numerical-precision-deep-learning-inference-and-training
"""
if isinstance(cfg["tile_ow"], int):
reg_n = cfg["tile_ow"]
else:
reg_n = cfg["tile_ow"].size[-1]
if isinstance(cfg["unroll_kw"], (int, bool)):
unroll_kw = cfg["unroll_kw"]
else:
unroll_kw = cfg["unroll_kw"].val
_, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
_, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
# schedule pad
if isinstance(s[data_vec].op, te.tensor.ComputeOp) and "pad" in data_vec.op.tag:
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
data_vec = data_vec.op.input_tensors[0]
if autotvm.GLOBAL_SCOPE.in_tuning:
        # During autotuning, the input data of conv2d_NCHWc will be 4-D.
        # Skip this part while tuning to keep the records accurate;
        # it will be folded by Relay's fold_constant pass.
if isinstance(data_vec.op, te.tensor.ComputeOp):
s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
if isinstance(kernel_vec.op, te.tensor.ComputeOp):
s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and kernel_vec.name == "kernel_vec":
        # Data and kernel are not pre-transformed, so schedule the layout transform here.
        # This path should only be used by x86 conv2d_nchw, which exists for
        # testing purposes.
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
# conv2d_nchwc_int8 has 7D kernel
oc_chunk, ic_chunk, oh, ow, ic_block, oc_block, _ = s[kernel_vec].op.axis
s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
oc_bn = cfg["tile_oc"].size[-1]
if oc_bn > 1:
s[kernel_vec].vectorize(oc_block)
parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
s[kernel_vec].parallel(parallel_axis)
# schedule 5-D NCHW[x]c conv
C, O = conv_out, last
CC = s.cache_write(C, mem_scope)
batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
ow_chunk, ow_block = s[C].split(ow, factor=reg_n)
s[C].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
parallel_axis = s[C].fuse(batch, oc_chunk, oh)
s[C].vectorize(oc_block)
if C == O:
s[C].parallel(parallel_axis)
s[CC].compute_at(s[C], parallel_axis)
_, oc_chunk, oh, ow, oc_block = s[CC].op.axis
kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
ow_chunk, ow_block = s[CC].split(ow, factor=reg_n)
assert oc_bn % int32_lanes == 0, f"oc_bn={oc_bn} % int32_lanes={int32_lanes} != 0"
assert (
ic_bn % int8_elems == 0
), f"ic_bn={ic_bn} % int8_elems={int8_elems} != 0" # (u)int8 elements in (u)int32
oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
if unroll_kw:
s[CC].reorder(
oc_chunk,
oh,
ow_chunk,
ic_outer,
kh,
ic_f_inner,
kw,
ow_block,
oc_f_inner,
oc_s_inner,
ic_s_inner,
)
s[CC].unroll(kw)
else:
s[CC].reorder(
oc_chunk,
oh,
ow_chunk,
ic_outer,
kh,
kw,
ic_f_inner,
ow_block,
oc_f_inner,
oc_s_inner,
ic_s_inner,
)
if intrin is not None:
s[CC].tensorize(oc_s_inner, intrin)
s[CC].unroll(ow_block)
s[CC].unroll(oc_f_inner)
if C != O:
out_ndim = len(s[O].op.axis)
if out_ndim == 5:
batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
elif out_ndim == 4:
batch, oc, oh, ow = s[O].op.axis
ow_chunk, ow_block = s[O].split(ow, factor=reg_n)
oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
s[O].reorder(oc_chunk, oh, ow_chunk, ow_block, oc_block)
else:
raise ValueError(f"Unsupported output ndim: {out_ndim}")
parallel_axis = s[O].fuse(batch, oc_chunk, oh)
if inline_fused:
s[C].compute_at(s[O], ow_block)
else:
s[C].compute_at(s[O], parallel_axis)
s[O].vectorize(oc_block)
s[O].parallel(parallel_axis)
return s
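

# A hedged wiring sketch (comments only, under assumptions): the helper above is
# normally not called directly but driven from a traverse_inline callback inside a
# target-specific schedule, roughly as below. The intrinsic shown is assumed to be
# the x86 16-lane uint8 x int8 dot-product intrinsic from
# tvm.topi.x86.tensor_intrin; substitute whichever intrinsic (or None) matches
# your target.
#
#   def _callback(op):
#       if "conv2d_NCHWc_int8" in op.tag:
#           conv_out = op.output(0)
#           data_vec = conv_out.op.input_tensors[0]
#           kernel_vec = conv_out.op.input_tensors[1]
#           intrin = dot_16x1x16_uint8_int8_int32()
#           schedule_conv_NCHWc_cpu_common_int8(
#               s, cfg, data_vec, kernel_vec, conv_out, outs[0],
#               int32_lanes=16, int8_elems=4, intrin=intrin, inline_fused=True,
#           )
#
#   traverse_inline(s, outs[0].op, _callback)

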
def schedule_conv_NCHWc_cpu_1x1_int8(
s,
cfg,
data_vec,
kernel_vec,
conv_out,
last,
int32_lanes=16,
int8_elems=4,
intrin=None,
inline_fused=False,
mem_scope="global",
):
"""
Defines the 1x1 conv schedule for INT8 for Intel and ARM machines
Uses the Intel/ARM intrinsics to use INT8 operations
More details - https://software.intel.com/en-us/articles/
lower-numerical-precision-deep-learning-inference-and-training
"""
oh_factor, ow_factor = cfg["tile_oh"].val, cfg["tile_ow"].size[-1]
_, _, _, _, ic_bn = get_const_tuple(data_vec.shape)
_, _, _, _, oc_bn = get_const_tuple(conv_out.shape)
# schedule pad
if isinstance(s[data_vec].op, te.tensor.ComputeOp) and "pad" in data_vec.op.tag:
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
data_vec = data_vec.op.input_tensors[0]
if autotvm.GLOBAL_SCOPE.in_tuning:
        # During autotuning, the input data of conv2d_NCHWc will be 4-D.
        # Skip this part while tuning to keep the records accurate;
        # it will be folded by Relay's fold_constant pass.
if isinstance(data_vec.op, te.tensor.ComputeOp):
s[data_vec].pragma(s[data_vec].op.axis[0], "debug_skip_region")
if isinstance(kernel_vec.op, te.tensor.ComputeOp):
s[kernel_vec].pragma(s[kernel_vec].op.axis[0], "debug_skip_region")
elif isinstance(kernel_vec.op, te.tensor.ComputeOp) and kernel_vec.name == "kernel_vec":
        # Data and kernel are not pre-transformed, so schedule the layout transform here.
        # This path should only be used by x86 conv2d_nchw, which exists for
        # testing purposes.
batch, ic_chunk, ih, iw, ic_block = s[data_vec].op.axis
parallel_axis = s[data_vec].fuse(batch, ic_chunk, ih)
s[data_vec].parallel(parallel_axis)
# Conv2d int8 schedule has 7D kernel
oc_chunk, ic_chunk, oh, ow, ic_block, oc_block, _ = s[kernel_vec].op.axis
s[kernel_vec].reorder(oc_chunk, oh, ic_chunk, ow, ic_block, oc_block)
oc_bn = cfg["tile_oc"].size[-1]
if oc_bn > 1:
s[kernel_vec].vectorize(oc_block)
parallel_axis = s[kernel_vec].fuse(oc_chunk, oh)
s[kernel_vec].parallel(parallel_axis)
C, O = conv_out, last
CC = s.cache_write(C, mem_scope)
batch, oc_chunk, oh, ow, oc_block = s[C].op.axis
oh_outer, oh_inner = s[C].split(oh, factor=oh_factor)
ow_outer, ow_inner = s[C].split(ow, factor=ow_factor)
s[C].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
s[C].vectorize(oc_block)
parallel_axis = s[C].fuse(batch, oc_chunk, oh_outer)
if C == O:
s[C].parallel(parallel_axis)
s[CC].compute_at(s[C], parallel_axis) # good perf on mobilenet, but not on individuals?
_, oc_chunk, oh, ow, oc_block = s[CC].op.axis
kh, kw, ic_outer, ic_f_inner, ic_s_inner = s[CC].op.reduce_axis
assert oc_bn % int32_lanes == 0
assert ic_bn % int8_elems == 0 # (u)int8 elements in (u)int32
oc_f_inner, oc_s_inner = s[CC].split(oc_block, factor=int32_lanes)
oh_outer, oh_inner = s[CC].split(oh, factor=oh_factor)
ow_outer, ow_inner = s[CC].split(ow, factor=ow_factor)
s[CC].reorder(
oc_chunk,
oh_outer,
ow_outer,
kh,
kw,
ic_outer,
ic_f_inner,
oh_inner,
ow_inner,
oc_f_inner,
oc_s_inner,
ic_s_inner,
)
s[CC].fuse(oc_chunk, oh_outer)
if intrin is not None:
s[CC].tensorize(oc_s_inner, intrin)
s[CC].unroll(ow_inner)
s[CC].unroll(oh_inner)
if C != O:
out_ndim = len(s[O].op.axis)
if out_ndim == 5:
batch, oc_chunk, oh, ow, oc_block = s[O].op.axis
oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
elif out_ndim == 4:
batch, oc, oh, ow = s[O].op.axis
oc_chunk, oc_block = s[O].split(oc, factor=oc_bn)
oh_outer, oh_inner = s[O].split(oh, factor=oh_factor)
ow_outer, ow_inner = s[O].split(ow, factor=ow_factor)
else:
raise ValueError(f"Unsupported output ndim: {out_ndim}")
s[O].reorder(oc_chunk, oh_outer, ow_outer, oh_inner, ow_inner, oc_block)
parallel_axis = s[O].fuse(batch, oc_chunk, oh_outer)
if inline_fused:
s[C].compute_at(s[O], ow_inner)
else:
s[C].compute_at(s[O], parallel_axis)
s[O].vectorize(oc_block)
s[O].parallel(parallel_axis)
return s
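

# The 1x1 variant above is wired the same way as the common variant (see the
# sketch after schedule_conv_NCHWc_cpu_common_int8); a caller would select it when
# the kernel is 1x1, e.g. roughly:
#
#   if kh == 1 and kw == 1:
#       schedule_conv_NCHWc_cpu_1x1_int8(
#           s, cfg, data_vec, kernel_vec, conv_out, outs[0],
#           int32_lanes=16, int8_elems=4, intrin=intrin, inline_fused=False,
#       )
#
# Unlike the common variant, this one requires cfg["tile_oh"] and cfg["tile_ow"]
# to be real config entities (e.g. from fallback_schedule_cpu_1x1_int8 or tuning),
# not plain integers.

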
def schedule_depthwise_conv2d_nhwc(outs):
"""Create schedule for depthwise conv2d in NHWC layout.
Parameters
----------
outs : list[te.tensor.Tensor]
The output tensors.
Returns
-------
s : tvm.te.schedule.Schedule
The computation schedule for depthwise conv2d.
"""
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
s = te.create_schedule([x.op for x in outs])
def _callback(op):
"""Traverse operators from computation graph"""
if "depthwise_conv2d_nhwc" in op.tag:
out = outs[0]
depthwise_conv2d_out = op.output(0)
data_pad = depthwise_conv2d_out.op.input_tensors[0]
s[data_pad].compute_inline()
if depthwise_conv2d_out != out:
s[depthwise_conv2d_out].compute_at(s[out], s[out].op.axis[3])
s[out].fuse(*s[out].op.axis)
traverse_inline(s, outs[0].op, _callback)
return s
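

# An illustrative, uncalled sketch: builds a small depthwise conv2d in NHWC layout
# with topi.nn.depthwise_conv2d_nhwc and schedules it with the helper above. The
# shapes are arbitrary example values.
def _example_schedule_depthwise_nhwc():
    from tvm import topi  # imported here to avoid a circular import at module load

    data = te.placeholder((1, 56, 56, 32), name="data", dtype="float32")
    # Filter layout is (kernel_h, kernel_w, in_channel, channel_multiplier).
    kernel = te.placeholder((3, 3, 32, 1), name="kernel", dtype="float32")
    out = topi.nn.depthwise_conv2d_nhwc(
        data, kernel, stride=(1, 1), padding=(1, 1), dilation=(1, 1)
    )
    s = schedule_depthwise_conv2d_nhwc([out])
    return s, [data, kernel, out]

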
def conv2d_alter_int8_common(
data,
data_tensor,
kernel,
kernel_tensor,
output_tensor,
attrs,
data_dtype: str,
in_channel_vector_length: int,
out_channel_vector_length: int,
):
"""
Convert TE inputs/outputs so that they are suitable for fast Int8 instructions.
Int8 instructions require input channels and output channels to be a
multiple of the vector length. For input channels, we pad both the inputs
and weights channels. For output channels, we pad the weight and
stride_slice the output.
Arguments
---------
data: Expr
Data Expr
data_tensor: Tensor
Data tensor
kernel: Expr
Kernel Expr
kernel_tensor: Tensor
Kernel tensor
output_tensor: Tensor
Output tensor
attrs: Conv2dAttrs
Attributes of the computation
data_dtype: "int8" or "uint8"
Desired dtype of data. Data will be converted to this dtype before the main computation.
in_channel_vector_length: int
Length of vector units on target hardware. Input channels are padded to this length.
out_channel_vector_length: int
Output size of vector instruction. Output channels are padded to this length.
Returns
-------
out : Tensor
Conv2d computation with inputs in the correct order for tensorization.
"""
# Dilation not supported yet. Return None if dilation is not (1, 1)
dilation = attrs.get_int_tuple("dilation")
if not (dilation[0] == 1 and dilation[1] == 1):
return None
# No legalization for depthwise convolutions yet.
groups = attrs.get_int("groups")
if groups != 1:
return None
# Get the conv attrs
new_attrs = {k: attrs[k] for k in attrs.keys()}
padding = attrs.get_int_tuple("padding")
kh, kw = attrs.get_int_tuple("kernel_size")
pt, pl, pb, pr = get_pad_tuple(padding, (kh, kw))
if data_tensor.dtype != data_dtype:
        # How to convert data to uint8:
        #   Original --> C = A (conv) B, where A and B are int8.
        #   C = (A + 128 - 128) (conv) B
        #   C = (A' (conv) B) - (128 (conv) B), where A' = A + 128
        #   and 128 (conv) B is effectively a reduction of the weights over their
        #   C, R, S (input-channel and spatial) axes.
        #
        # How to convert data to int8:
        #   C = (A - 128 + 128) (conv) B
        #   C = (A' (conv) B) + (128 (conv) B), where A' = A - 128
if data_dtype == "uint8":
# shift data to uint8
before_shift = relay.add
after_shift = relay.subtract
pad_value = 128
else:
# shift data to int8
before_shift = relay.subtract
after_shift = relay.add
pad_value = -128
if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(0, 1, 2))
pad_width = ((0, 0), (pt, pb), (pl, pr), (0, 0))
elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
pad_width = ((0, 0), (0, 0), (pt, pb), (pl, pr))
adjust_shift = relay.sum(relay.cast(kernel, dtype="int32"), axis=(1, 2, 3))
adjust_shift = relay.expand_dims(adjust_shift, axis=1, num_newaxis=2)
else:
return None
data = relay.cast(data, "int32")
data = before_shift(data, relay.const(128, "int32"))
data = relay.cast(data, data_dtype)
        # Pad explicitly here: after the shift, zero padding in the original domain
        # corresponds to +/-128, which conv2d's implicit zero padding cannot represent.
if any(padding):
data = relay.nn.pad(data, pad_width=pad_width, pad_value=pad_value)
new_attrs["padding"] = (0, 0)
        # Multiply by 128 to form the adjustment term (128 (conv) B).
adjust_shift = relay.multiply(adjust_shift, relay.const(128, "int32"))
# Flags to remember if the expr is modified
ic_modified = False
oc_modified = False
# Find the value of input and output channel.
in_channel = -1
out_channel = -1
if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
in_channel = data_tensor.shape[3].value
out_channel = kernel_tensor.shape[3].value
elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
in_channel = data_tensor.shape[1].value
out_channel = kernel_tensor.shape[0].value
else:
return None
if in_channel % in_channel_vector_length != 0:
new_in_channel = (
(in_channel + in_channel_vector_length) // in_channel_vector_length
) * in_channel_vector_length
diff = new_in_channel - in_channel
if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, diff)))
kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, diff), (0, 0)))
ic_modified = True
elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
pad_width = ((0, 0), (0, diff), (0, 0), (0, 0))
data = relay.nn.pad(data, pad_width=pad_width)
kernel = relay.nn.pad(kernel, pad_width=pad_width)
ic_modified = True
else:
return None
new_out_channel = out_channel
if out_channel % out_channel_vector_length != 0:
new_out_channel = (
(out_channel + out_channel_vector_length) // out_channel_vector_length
) * out_channel_vector_length
diff = new_out_channel - out_channel
if attrs["data_layout"] == "NHWC" and attrs["kernel_layout"] == "HWIO":
kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, diff)))
oc_modified = True
elif attrs["data_layout"] == "NCHW" and attrs["kernel_layout"] == "OIHW":
kernel = relay.nn.pad(kernel, pad_width=((0, diff), (0, 0), (0, 0), (0, 0)))
oc_modified = True
else:
return None
if oc_modified:
new_attrs["channels"] = new_out_channel
out = relay.nn.conv2d(data, kernel, **new_attrs)
original_out_shape = [x.value for x in output_tensor.shape]
out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape)
else:
out = relay.nn.conv2d(data, kernel, **new_attrs)
if data_tensor.dtype != data_dtype:
out = after_shift(out, adjust_shift)
return out
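

# An illustrative, uncalled sketch of the legalization above. In practice this
# helper is invoked from a conv2d alter-op-layout / legalize handler that already
# has the Relay call attributes and TE tensor types in hand; here we fabricate a
# minimal stand-in. All shapes and the int8/uint8 dtypes are example assumptions.
def _example_alter_int8_common():
    data = relay.var("data", shape=(1, 56, 56, 40), dtype="int8")
    kernel = relay.var("kernel", shape=(3, 3, 40, 40), dtype="int8")
    # Build a throwaway conv2d call just to obtain a Conv2DAttrs object.
    call = relay.nn.conv2d(
        data,
        kernel,
        kernel_size=(3, 3),
        channels=40,
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="int32",
    )
    data_tensor = te.placeholder((1, 56, 56, 40), "int8")
    kernel_tensor = te.placeholder((3, 3, 40, 40), "int8")
    output_tensor = te.placeholder((1, 54, 54, 40), "int32")
    # Request uint8 data with 4-element input vectors and 16-lane output vectors:
    # the data is shifted by +128, the 40 output channels are padded to 48, and
    # strided_slice crops the result back to the original output shape.
    return conv2d_alter_int8_common(
        data,
        data_tensor,
        kernel,
        kernel_tensor,
        output_tensor,
        call.attrs,
        "uint8",
        in_channel_vector_length=4,
        out_channel_vector_length=16,
    )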