python/tvm/relay/op/strategy/arm_cpu.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 """Definition of ARM CPU operator strategy."""
 from functools import reduce
 import logging

 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
 import re

 from tvm import relay, topi, tir

 from ....auto_scheduler import is_auto_scheduler_enabled
 from ....meta_schedule import is_meta_schedule_enabled
 from ....topi.generic import conv2d as conv2d_generic
 from .. import op as _op
 from .generic import *

 # Logger named "strategy"; used below to warn whenever a schedule that is not
 # optimized for arm cpu is selected.
 logger = logging.getLogger("strategy")


 @schedule_reduce.register("arm_cpu")
 def schedule_reduce_cpu(attrs, outs, target):
     """Schedule reduction ops for arm_cpu.

     The x86 TOPI reduce schedule is reused; it is built while ``target`` is the
     active target context. ``attrs`` is accepted for interface compatibility
     and is not read.
     """
     with target:
         reduce_schedule = topi.x86.schedule_reduce(outs)
         return reduce_schedule


 @schedule_injective.register("arm_cpu")
 def schedule_injective_arm_cpu(_, outs, target):
     """Schedule injective ops for arm cpu.

     Delegates to ``topi.arm_cpu.schedule_injective`` with ``target`` as the
     active target context. The first (attrs) argument is ignored.
     """
     with target:
         injective_schedule = topi.arm_cpu.schedule_injective(outs)
         return injective_schedule


 @schedule_concatenate.register("arm_cpu")
 def schedule_concatenate_arm_cpu(_, outs, target):
     """Schedule concatenate for arm cpu.

     Delegates to ``topi.arm_cpu.schedule_concatenate`` with ``target`` as the
     active target context. The first (attrs) argument is ignored.
     """
     with target:
         concatenate_schedule = topi.arm_cpu.schedule_concatenate(outs)
         return concatenate_schedule


 @schedule_pool.register(["arm_cpu"])
 def schedule_pool_arm_cpu(attrs, outs, target):
     """Schedule pooling ops for arm cpu.

     The Arm-specific schedule is chosen only when the target reports DSP support
     and the layout matches the pooling kind: "NCW"/"NCHW" when ``attrs`` is an
     ``AvgPool2DAttrs``, "NWC"/"NHWC" for every other pooling attrs type. In all
     other cases a warning is logged and the generic schedule is returned.
     """
     layout = attrs.layout
     is_avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs)
     with target:
         # Layouts for which the arm_cpu schedule is used, per pooling kind.
         dsp_layouts = ("NCW", "NCHW") if is_avg_pool else ("NWC", "NHWC")
         if target.features.has_dsp and layout in dsp_layouts:
             return topi.arm_cpu.schedule_pool(outs, layout)
         logger.warning("pool is not optimized for arm cpu.")
         return topi.generic.schedule_pool(outs, layout)


 def _get_padding_width(padding):
     assert isinstance(padding, tuple)
     if len(padding) == 2:
         _, (pad_left, pad_right) = padding
     else:
         _, pad_left, _, pad_right = padding
     return pad_left + pad_right


 def _is_simd_aligned(dtype, dimensions, padding=None):
     if padding:
         assert len(dimensions) == len(padding)
         padded_dims = (sum(x) for x in zip(dimensions, padding))
     else:
         padded_dims = dimensions

     # Multiply all elements of padded_dims together. We can't use math.prod, as it
     # does not exist in Python 3.7.
     size = reduce(lambda x, y: x * y, padded_dims)
     return (
         (dtype == "int8" and size % 4 == 0)
         or (dtype == "int16" and size % 2 == 0)
         or (dtype == "int32")
     )


 @conv2d_strategy.register("arm_cpu")
 def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d arm cpu strategy

     Builds the list of candidate conv2d implementations for arm_cpu. The choice
     depends on ``groups`` (regular, depthwise or grouped convolution), the data
     and kernel layouts, the input dtypes, and the target features ``has_dsp``,
     ``has_asimd`` and ``has_dotprod``.

     Parameters
     ----------
     attrs
         conv2d attributes; dilation, strides, padding, groups, data_layout and
         kernel_layout are read.
     inputs
         Pair ``(data, kernel)``.
     out_type
         Output type; only its dtype is consulted (NHWC depthwise DSP path).
     target
         Compilation target; its ``features`` and ``libs`` are consulted.

     Returns
     -------
     strategy : _op.OpStrategy
         Strategy holding every implementation registered for this configuration.

     Raises
     ------
     ValueError
         If either dilation value is smaller than 1.
     RuntimeError
         If the data layout or kernel layout is not supported.
     AssertionError
         If a layout combination checked with ``assert`` below does not hold.
     """
     strategy = _op.OpStrategy()
     data, kernel = inputs
     dilation_h, dilation_w = attrs.get_int_tuple("dilation")
     stride_h, stride_w = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     groups = attrs.groups
     layout = attrs.data_layout
     kernel_layout = attrs.kernel_layout
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")

     # Regular (non-grouped) convolution.
     if groups == 1:
         if layout == "NCHW":
             if kernel_layout == "OIHW":
                 # int8 implementation: only when the hardware check passes and
                 # kernel.shape[1] (the "I" axis of OIHW) is at least 64.
                 if (
                     topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype)
                     and kernel.shape[1] >= 64
                 ):
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
                         name="conv2d_nchw_int8.arm_cpu",
                         plevel=15,
                     )
                 else:
                     # ARM conv2d spatial pack schedule.
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                         name="conv2d_nchw_spatial_pack.arm_cpu",
                         plevel=10,
                     )

                     # The x86 NCHW implementation is registered as a further
                     # candidate, without an explicit plevel.
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.x86.conv2d_nchw),
                         wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                         name="conv2d_nchw.x86",
                     )

                 # check if winograd algorithm is applicable
                 # (evaluated after either branch above; requires float data and
                 # kernel, a 3x3 kernel, unit strides and unit dilation)
                 _, _, kh, kw = get_const_tuple(kernel.shape)
                 pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
                 is_winograd_applicable = (
                     "float" in data.dtype
                     and "float" in kernel.dtype
                     and kh == 3
                     and kw == 3
                     and stride_h == 1
                     and stride_w == 1
                     and dilation_h == 1
                     and dilation_w == 1
                 )
                 if is_winograd_applicable:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
                         name="conv2d_nchw_winograd.arm_cpu",
                         plevel=5,
                     )
                     # The NNPACK variant additionally needs "nnpack" in the target
                     # libs and a padding of exactly 1 on every side.
                     if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1:
                         strategy.add_implementation(
                             wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack),
                             wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack),
                             name="conv2d_nchw_winograd_nnpack.arm_cpu",
                             plevel=15,
                         )
             # Kernel layouts that start with "OIHW", optional digits, then "o"
             # (e.g. "OIHW4o").
             elif re.match(r"OIHW\d*o", kernel_layout):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                     name="conv2d_nchw_spatial_pack.arm_cpu",
                 )
             else:
                 raise RuntimeError(
                     "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout)
                 )
         elif layout == "HWCN":
             assert kernel_layout == "HWIO"
             logger.warning("conv2d_hwcn is not optimized for arm cpu.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.conv2d_hwcn),
                 wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                 name="conv2d_hwcn.generic",
             )
         elif layout == "NHWC":
             # Total (left + right) padding along the width axis.
             data_width_padding = _get_padding_width(padding)
             if (
                 target.features.has_dsp
                 and dilation_w == dilation_h == 1
                 and kernel_layout == "OHWI"
                 # Check SIMD alignment
                 # (data.shape[2:] is (W, C) in NHWC, the padding is added to W only;
                 # kernel.shape[2:] is (W, I) in OHWI)
                 and _is_simd_aligned(data.dtype, data.shape[2:], padding=(data_width_padding, 0))
                 and _is_simd_aligned(kernel.dtype, kernel.shape[2:])
             ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp, need_out_layout=True),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_ohwi_dsp),
                     name="conv2d_nhwc_ohwi_dsp.arm_cpu",
                 )
             elif target.features.has_dsp and kernel_layout == "HWOI":
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
                     name="conv2d_nhwc_dsp.arm_cpu",
                 )
             elif kernel_layout == "HWIO":
                 has_asimd = target.features.has_asimd
                 has_dot_prod = target.features.has_dotprod
                 # The three checks below are independent `if`s, so more than one
                 # implementation may be registered for the same configuration.
                 if has_dot_prod and data.dtype in ["int8", "uint8"]:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
                         name="conv2d_NHWC_quantized_native.arm_cpu",
                     )
                 if has_asimd and data.dtype in ["int8", "uint8"]:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
                         name="conv2d_NHWC_quantized_interleaved.arm_cpu",
                     )
                 if (not has_asimd) or (data.dtype not in ["int8", "uint8"]):
                     # TODO(@giuseros)
                     # This strategy errors out for quantized data types when tuning.
                     # Let's use this only for non-aarch64 or non-quantized cases
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
                         name="conv2d_nhwc_spatial_pack.arm_cpu",
                     )
             else:
                 raise RuntimeError(
                     "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout)
                 )

         else:
             raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout))
     # Depthwise convolution.
     elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
         if layout == "NCHW":
             assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
             if kernel_layout == "OIHW":
                 # Total (left + right) padding along the width axis.
                 data_width_padding = _get_padding_width(padding)
                 # data.shape[3:] and kernel.shape[3:] are the width axes of NCHW and
                 # OIHW respectively.
                 if (
                     target.features.has_dsp
                     and dilation_w == dilation_h == 1
                     and _is_simd_aligned(data.dtype, data.shape[3:], padding=(data_width_padding,))
                     and _is_simd_aligned(kernel.dtype, kernel.shape[3:])
                 ):
                     strategy.add_implementation(
                         wrap_compute_conv2d(
                             topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp, need_out_layout=True
                         ),
                         wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_oihw_dsp),
                         name="depthwise_conv2d_nchw_oihw_dsp.arm_cpu",
                     )
                 else:
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
                         wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
                         name="depthwise_conv2d_nchw.arm_cpu",
                     )

             # TODO:
             # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
             # Let us comment it out but not remove.
             # see discussion:
             # https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088
             # strategy.add_implementation(
             #     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack),
             #     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack),
             #     name="depthwise_conv2d_nchw_spatial_pack.arm_cpu",
             #     plevel=15)

             # Intel x86 depthwise conv2d schedule.
             # Registered only when the second kernel dimension is 1 and there is no
             # dilation.
             channel_multiplier = get_const_tuple(inputs[1].shape)[1]
             if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
                     wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
                     name="depthwise_conv2d_nchw.x86",
                 )
         elif layout == "NHWC":
             assert kernel_layout == "HWOI"
             if target.features.has_asimd:
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
                     name="depthwise_conv2d_nhwc.arm_cpu",
                 )

             # Optimized special case depthwiseConv2D operation. Requires NHWC layout,
             # a HWOI kernel layout (which we rearrange to a custom layout) no dilation,
             # int8/16 inputs, int32 output, and the same number of input and output channels.
             # The int8 implementation DOES need the DSP unit (for SXTB16), but it is not
             # possible to use the DSP unit to speed up a NHWC depthwise convolution (though
             # an NCHW convolution would benefit).

             elif (
                 dilation_w == dilation_h == 1
                 and kernel.shape[3] == 1  # channel_multiplier == 1
                 and out_type.dtype == "int32"
                 and (
                     (data.shape[3] % 4 == 0 and data.dtype == "int8" and target.features.has_dsp)
                     or (data.shape[3] % 2 == 0 and data.dtype == "int16")
                 )
                 # NOTE(review): `padding` comes from attrs.get_int_tuple above, so the
                 # comparison with the string "SAME" looks like it is always true and
                 # the stride divisibility check never runs -- confirm the intent.
                 and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
                 # Ideally we should check that kernel is a Relay constant, but strategy functions
                 # don't have access to the data needed to check this.
             ):
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
                     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc_dsp),
                     name="depthwise_conv2d_nhwc_dsp.arm_cpu",
                 )

             else:
                 logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
                 strategy.add_implementation(
                     wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
                     wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
                     name="depthwise_conv2d_nhwc.generic",
                 )
         else:
             raise RuntimeError("Unsupported depthwise_conv2d layout {} for arm cpu".format(layout))
     else:  # group_conv2d
         if layout == "NCHW":
             assert kernel_layout == "OIHW"
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True),
                 wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw),
                 name="group_conv2d_nchw.arm_cpu",
             )
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
             logger.warning("group_conv2d with layout NHWC is not optimized for arm cpu.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
                 wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
                 name="group_conv2d_nhwc.generic",
             )
         else:
             raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu".format(layout))
     return strategy


 @conv2d_NCHWc_strategy.register("arm_cpu")
 def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d_NCHWc strategy for arm cpu, adopted from x86.

     Registers the arm_cpu int8 implementation when
     ``topi.arm_cpu.is_int8_hw_support`` accepts the data/kernel dtypes, and the
     x86 implementation otherwise. Exactly one implementation is registered.
     """
     strategy = _op.OpStrategy()
     data, kernel = inputs
     if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
         compute = topi.arm_cpu.conv2d_NCHWc_int8
         schedule = topi.arm_cpu.schedule_conv2d_NCHWc_int8
         impl_name = "conv2d_NCHWc_int8.arm_cpu"
     else:
         compute = topi.x86.conv2d_NCHWc
         schedule = topi.x86.schedule_conv2d_NCHWc
         impl_name = "conv2d_NCHWc.x86"
     strategy.add_implementation(
         wrap_compute_conv2d(compute, need_data_layout=True, need_out_layout=True),
         wrap_topi_schedule(schedule),
         name=impl_name,
     )
     return strategy


 @depthwise_conv2d_NCHWc_strategy.register("arm_cpu")
 def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
     """depthwise_conv2d_NCHWc strategy for arm cpu, adopted from x86.

     Always registers the single x86 implementation; none of the arguments is
     inspected.
     """
     strategy = _op.OpStrategy()
     compute = wrap_compute_conv2d(
         topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
     )
     schedule = wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc)
     strategy.add_implementation(compute, schedule, name="depthwise_conv2d_NCHWc.x86")
     return strategy


 def wrap_compute_conv2d_winograd_nnpack(topi_compute):
     """Wrap a TOPI compute function for conv2d_winograd NNPack.

     The returned callable takes ``(attrs, inputs, out_type)``, reads padding,
     strides, dilation and out_dtype from ``attrs`` and calls
     ``topi_compute(data, kernel, None, strides, padding, dilation, out_dtype)``.
     An out_dtype of "same" or "" is replaced by the dtype of ``inputs[0]``. The
     result is returned wrapped in a one-element list.
     """

     def _compute_conv2d_nnpack(attrs, inputs, out_type):
         pad, stride, dilate = (
             attrs.get_int_tuple(key) for key in ("padding", "strides", "dilation")
         )
         requested_dtype = attrs.get_str("out_dtype")
         if requested_dtype in ("same", ""):
             result_dtype = inputs[0].dtype
         else:
             result_dtype = requested_dtype
         result = topi_compute(inputs[0], inputs[1], None, stride, pad, dilate, result_dtype)
         return [result]

     return _compute_conv2d_nnpack


 @conv2d_winograd_without_weight_transform_strategy.register("arm_cpu")
 def conv2d_winograd_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d_winograd_without_weight_transform strategy for arm cpu.

     Only NCHW data with unit dilation, unit strides and ``groups == 1`` is
     accepted. A 5-D kernel selects the arm_cpu winograd implementation (the
     original kernel must have been 3x3); a 4-D kernel selects the NNPACK
     implementation and requires "nnpack" in ``target.libs``.

     Raises
     ------
     AssertionError
         If dilation, strides, groups, kernel size or the nnpack requirement is
         not satisfied.
     RuntimeError
         If the layout is not "NCHW" or the kernel is neither 4-D nor 5-D.
     """
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
     layout = attrs.data_layout
     strides = attrs.get_int_tuple("strides")
     kernel = inputs[1]
     assert dilation == (1, 1), "Do not support dilate now"
     assert strides == (1, 1), "Do not support strides now"
     assert groups == 1, "Do not support arbitrary group number"
     strategy = _op.OpStrategy()

     if layout != "NCHW":
         raise RuntimeError(
             "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
         )

     kernel_rank = len(kernel.shape)
     if kernel_rank == 5:
         # The first two kernel dims are the 3x3 kernel extended by tile_size - 1.
         padded_kh, padded_kw = get_const_tuple(kernel.shape)[:2]
         tile_size = attrs.get_int("tile_size")
         kh = padded_kh - tile_size + 1
         kw = padded_kw - tile_size + 1
         assert kh == 3 and kw == 3
         strategy.add_implementation(
             wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
             wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
             name="conv2d_nchw_winograd.arm_cpu",
         )
         return strategy

     if kernel_rank == 4:
         # kernel must be packed by winograd nnpack
         assert "nnpack" in target.libs
         nnpack_compute = topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform
         nnpack_schedule = (
             topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform
         )
         strategy.add_implementation(
             wrap_compute_conv2d_winograd_nnpack(nnpack_compute),
             wrap_topi_schedule(nnpack_schedule),
             name="conv2d_nchw_winograd_nnpack_withou_weight_transform.arm_cpu",
             plevel=15,
         )
         return strategy

     raise RuntimeError("Unsupported kernel shape: {}".format(kernel.shape))


 def wrap_compute_conv2d_gemm(topi_compute):
     """Wrap a TOPI compute function for conv2d_gemm.

     The returned callable takes ``(attrs, inputs, out_type)`` and calls
     ``topi_compute(data, kernel, strides, padding, dilation, out_dtype,
     kernel_size, channels)`` with values read from ``attrs``. An out_dtype of
     "same" or "" is replaced by the dtype of ``inputs[0]``. The result is
     returned wrapped in a one-element list.
     """

     def _compute_conv2d_gemm(attrs, inputs, out_type):
         pad, stride, dilate = (
             attrs.get_int_tuple(key) for key in ("padding", "strides", "dilation")
         )
         requested_dtype = attrs.get_str("out_dtype")
         num_channels = attrs["channels"]
         kernel_extent = attrs["kernel_size"]
         if requested_dtype in ("same", ""):
             result_dtype = inputs[0].dtype
         else:
             result_dtype = requested_dtype
         result = topi_compute(
             inputs[0],
             inputs[1],
             stride,
             pad,
             dilate,
             result_dtype,
             kernel_extent,
             num_channels,
         )
         return [result]

     return _compute_conv2d_gemm


 @conv2d_gemm_without_weight_transform_strategy.register("arm_cpu")
 def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d_gemm_without_weight_transform arm cpu strategy

     Registers the quantized NHWC "native" and "interleaved" GEMM implementations
     (in that order) for int8/uint8 data in NHWC layout.

     Parameters
     ----------
     attrs
         Operator attributes; only ``data_layout`` is read here.
     inputs
         Operator inputs; only the dtype of ``inputs[0]`` (the data) is read here.
     out_type, target
         Not inspected.

     Returns
     -------
     strategy : _op.OpStrategy
         Strategy holding both implementations.

     Raises
     ------
     RuntimeError
         If the layout is not "NHWC" or the data dtype is not int8/uint8.
     """
     layout = attrs.data_layout
     data = inputs[0]
     strategy = _op.OpStrategy()

     interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform
     native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform
     if layout == "NHWC" and data.dtype in ["int8", "uint8"]:
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(native_compute),
             wrap_topi_schedule(
                 topi.arm_cpu.schedule_conv2d_NHWC_quantized_native_without_transform
             ),
             name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
         )
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(interleaved_compute),
             wrap_topi_schedule(
                 topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved_without_transform
             ),
             name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
         )
     else:
         # The two literals are concatenated, so the first one must end with a
         # space to keep the layout and "with" apart in the message.
         raise RuntimeError(
             "Unsupported conv2d_NHWC_quantized_without_transform layout {0} "
             "with datatype {1}".format(layout, data.dtype)
         )
     return strategy


 @conv2d_transpose_strategy.register("arm_cpu")
 def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv2d_transpose strategy for arm cpu.

     Only NCHW data with dilation ``(1, 1)`` and ``groups == 1`` is accepted; any
     other configuration fails an assertion. A single arm_cpu implementation is
     registered.
     """
     data_layout = attrs.data_layout
     dilation = get_const_tuple(attrs.dilation)
     num_groups = attrs.groups
     assert data_layout == "NCHW", "only support nchw for now"
     assert dilation == (1, 1), "not support dilate now"
     assert num_groups == 1, "only support groups == 1 for now"
     strategy = _op.OpStrategy()
     compute = wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw)
     schedule = wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw)
     strategy.add_implementation(compute, schedule, name="conv2d_tranpose_nchw.arm_cpu")
     return strategy


 @bitserial_conv2d_strategy.register("arm_cpu")
 def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_conv2d strategy for arm cpu.

     NCHW data uses the x86 TOPI implementation and NHWC data uses the arm_cpu
     one; both are registered under an ``.arm_cpu`` name.

     Raises
     ------
     ValueError
         If the data layout is neither "NCHW" nor "NHWC".
     """
     strategy = _op.OpStrategy()
     layout = attrs.data_layout
     if layout == "NCHW":
         compute = topi.x86.bitserial_conv2d_nchw
         schedule = topi.x86.schedule_bitserial_conv2d_nchw
         impl_name = "bitserial_conv2d_nchw.arm_cpu"
     elif layout == "NHWC":
         compute = topi.arm_cpu.bitserial_conv2d_nhwc
         schedule = topi.arm_cpu.schedule_bitserial_conv2d_nhwc
         impl_name = "bitserial_conv2d_nhwc.arm_cpu"
     else:
         raise ValueError("Data layout {} not supported.".format(layout))
     strategy.add_implementation(
         wrap_compute_bitserial_conv2d(compute),
         wrap_topi_schedule(schedule),
         name=impl_name,
     )
     return strategy


 @bitserial_dense_strategy.register("arm_cpu")
 def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
     """bitserial_dense strategy for arm cpu.

     Always registers the single arm_cpu implementation; none of the arguments
     is inspected.
     """
     strategy = _op.OpStrategy()
     compute = wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense)
     schedule = wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense)
     strategy.add_implementation(compute, schedule, name="bitserial_dense.arm_cpu")
     return strategy


 @dense_strategy.register(["arm_cpu"])
 def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
     """dense strategy for arm cpu.

     Exactly one implementation is registered, chosen in this order:

     1. the arm_cpu DSP implementation, when the target has DSP support and the
        data dtype is int8 or int16;
     2. the x86 dynamic implementation (plevel 20), when the first data dimension
        is a constant 1 and either input has a dynamic shape;
     3. the generic implementation, after logging a warning.
     """
     strategy = _op.OpStrategy()
     data, weight = inputs

     if target.features.has_dsp and data.dtype in ["int8", "int16"]:
         strategy.add_implementation(
             wrap_compute_dense(topi.arm_cpu.dense_dsp),
             wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
             name="dense_dsp.arm_cpu",
         )
         return strategy

     # For dynamic matrix-vector multiply we use a hand written kernel.
     first_dim = data.shape[0]
     is_single_row = isinstance(first_dim, (int, tir.IntImm)) and first_dim == 1
     if is_single_row and (
         topi.utils.is_dynamic_shape(data.shape) or topi.utils.is_dynamic_shape(weight.shape)
     ):
         strategy.add_implementation(
             wrap_compute_dense(topi.x86.dense_dynamic),
             wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
             name="dense_dynamic.x86",
             plevel=20,
         )
         return strategy

     logger.warning("dense is not optimized for arm cpu.")
     generic_compute = wrap_compute_dense(
         topi.nn.dense,
         need_auto_scheduler_layout=is_auto_scheduler_enabled(),
         need_meta_schedule_layout=is_meta_schedule_enabled(),
     )
     strategy.add_implementation(
         generic_compute,
         wrap_topi_schedule(topi.generic.schedule_dense),
         name="dense.generic",
     )
     return strategy


 @conv1d_strategy.register("arm_cpu")
 def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target):
     """conv1d strategy for arm cpu.

     A "WOI" kernel layout is only supported together with NWC data on a target
     with DSP support, and then uses the arm_cpu DSP implementation. For any other
     kernel layout, NCW and NWC data fall back to the generic implementations
     after logging a warning.

     Raises
     ------
     ValueError
         If the first dilation value is smaller than 1.
     RuntimeError
         If the kernel layout / data layout combination is not supported.
     """
     strategy = _op.OpStrategy()
     layout = attrs.data_layout
     kernel_layout = attrs.kernel_layout
     dilation = get_const_tuple(attrs.dilation)
     if dilation[0] < 1:
         raise ValueError("dilation should be a positive value")

     if kernel_layout == "WOI":
         if not (layout == "NWC" and target.features.has_dsp):
             raise RuntimeError(
                 "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(
                     kernel_layout, layout
                 )
             )
         strategy.add_implementation(
             wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp),
             wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp),
             name="conv1d_dsp.arm_cpu",
         )
         return strategy

     if layout not in ("NCW", "NWC"):
         raise RuntimeError(
             "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(kernel_layout, layout)
         )

     # Both remaining layouts use generic, non-optimized implementations.
     logger.warning("conv1d with layout %s is not optimized for arm cpu.", layout)
     if layout == "NCW":
         compute = topi.nn.conv1d_ncw
         schedule = topi.generic.schedule_conv1d_ncw
         impl_name = "conv1d_ncw.generic"
     else:
         compute = topi.nn.conv1d_nwc
         schedule = topi.generic.schedule_conv1d_nwc
         impl_name = "conv1d_nwc.generic"
     strategy.add_implementation(
         wrap_compute_conv1d(compute),
         wrap_topi_schedule(schedule),
         name=impl_name,
     )
     return strategy