blob: c8d51bc23c82dd4d6f048fe9bacc7d879ef48337 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Definition of ARM CPU operator strategy."""
from functools import reduce
import logging
# pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
import re
from tvm import relay, topi, tir
from ....auto_scheduler import is_auto_scheduler_enabled
from ....meta_schedule import is_meta_schedule_enabled
from ....topi.generic import conv2d as conv2d_generic
from .. import op as _op
from .generic import *
logger = logging.getLogger("strategy")
@schedule_reduce.register("arm_cpu")
def schedule_reduce_cpu(attrs, outs, target):
    """Schedule reduction ops for arm_cpu (reuses the x86 reduce schedule)."""
    with target:
        sched = topi.x86.schedule_reduce(outs)
    return sched
@schedule_injective.register("arm_cpu")
def schedule_injective_arm_cpu(_, outs, target):
    """Schedule injective (elementwise/broadcast-style) ops for arm cpu."""
    with target:
        sched = topi.arm_cpu.schedule_injective(outs)
    return sched
@schedule_concatenate.register("arm_cpu")
def schedule_concatenate_arm_cpu(_, outs, target):
    """Schedule concatenate for arm cpu."""
    with target:
        sched = topi.arm_cpu.schedule_concatenate(outs)
    return sched
@schedule_pool.register(["arm_cpu"])
def schedule_pool_arm_cpu(attrs, outs, target):
    """Schedule pooling ops for arm cpu.

    The DSP-optimized pool schedule only covers specific layout/pool-type
    combinations; everything else falls back to the generic schedule.
    """
    layout = attrs.layout
    is_avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs)
    with target:
        if is_avg_pool:
            # DSP avg-pool path handles channel-first layouts.
            use_dsp = target.features.has_dsp and layout in ("NCW", "NCHW")
        else:
            # DSP max-pool path handles channel-last layouts.
            use_dsp = target.features.has_dsp and layout in ("NWC", "NHWC")
        if use_dsp:
            return topi.arm_cpu.schedule_pool(outs, layout)
        logger.warning("pool is not optimized for arm cpu.")
        return topi.generic.schedule_pool(outs, layout)
def _get_padding_width(padding):
assert isinstance(padding, tuple)
if len(padding) == 2:
_, (pad_left, pad_right) = padding
else:
_, pad_left, _, pad_right = padding
return pad_left + pad_right
def _is_simd_aligned(dtype, dimensions, padding=None):
if padding:
assert len(dimensions) == len(padding)
padded_dims = (sum(x) for x in zip(dimensions, padding))
else:
padded_dims = dimensions
# Multiply all elements of padded_dims together. We can't use math.prod, as it
# does not exist in Python 3.7.
size = reduce(lambda x, y: x * y, padded_dims)
return (
(dtype == "int8" and size % 4 == 0)
or (dtype == "int16" and size % 2 == 0)
or (dtype == "int32")
)
@conv2d_strategy.register("arm_cpu")
def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d arm cpu strategy.

    Dispatches on grouping (dense / depthwise / grouped), data and kernel
    layouts, dtypes, and the target's CPU features (DSP, Advanced SIMD,
    dot-product) to register TOPI compute/schedule pairs.  Multiple
    implementations may be registered; higher ``plevel`` values take
    priority.  Falls back to generic or x86 schedules when no
    ARM-specific implementation applies.
    """
    strategy = _op.OpStrategy()
    data, kernel = inputs
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")
    if groups == 1:
        # ---- Dense (non-grouped) conv2d ----
        if layout == "NCHW":
            if kernel_layout == "OIHW":
                # Int8 fast path: needs hardware int8 support and at least
                # 64 input channels (kernel.shape[1] is the I axis of OIHW).
                if (
                    topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype)
                    and kernel.shape[1] >= 64
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
                        name="conv2d_nchw_int8.arm_cpu",
                        plevel=15,
                    )
                else:
                    # ARM conv2d spatial pack schedule.
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                        name="conv2d_nchw_spatial_pack.arm_cpu",
                        plevel=10,
                    )
                    # The x86 schedule is registered as an extra candidate
                    # for the tuner to compare against.
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.x86.conv2d_nchw),
                        wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                        name="conv2d_nchw.x86",
                    )
                # check if winograd algorithm is applicable
                _, _, kh, kw = get_const_tuple(kernel.shape)
                pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
                # Winograd only for float 3x3 kernels with unit stride and
                # unit dilation.
                is_winograd_applicable = (
                    "float" in data.dtype
                    and "float" in kernel.dtype
                    and kh == 3
                    and kw == 3
                    and stride_h == 1
                    and stride_w == 1
                    and dilation_h == 1
                    and dilation_w == 1
                )
                if is_winograd_applicable:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
                        name="conv2d_nchw_winograd.arm_cpu",
                        plevel=5,
                    )
                    # NNPACK winograd additionally requires symmetric
                    # one-pixel padding on all sides.
                    if "nnpack" in target.libs and pt == 1 and pb == 1 and pl == 1 and pr == 1:
                        strategy.add_implementation(
                            wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd_nnpack),
                            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack),
                            name="conv2d_nchw_winograd_nnpack.arm_cpu",
                            plevel=15,
                        )
            elif re.match(r"OIHW\d*o", kernel_layout):
                # Pre-packed OIHW<n>o weights: only spatial pack supports them.
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                    name="conv2d_nchw_spatial_pack.arm_cpu",
                )
            else:
                raise RuntimeError(
                    "Unsupported weight layout {} for conv2d NCHW".format(kernel_layout)
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            logger.warning("conv2d_hwcn is not optimized for arm cpu.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_hwcn),
                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
                name="conv2d_hwcn.generic",
            )
        elif layout == "NHWC":
            data_width_padding = _get_padding_width(padding)
            # DSP path for OHWI weights: unit dilation plus SIMD-aligned
            # data (width incl. padding) and kernel sizes.
            if (
                target.features.has_dsp
                and dilation_w == dilation_h == 1
                and kernel_layout == "OHWI"
                # Check SIMD alignment
                and _is_simd_aligned(data.dtype, data.shape[2:], padding=(data_width_padding, 0))
                and _is_simd_aligned(kernel.dtype, kernel.shape[2:])
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_ohwi_dsp, need_out_layout=True),
                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_ohwi_dsp),
                    name="conv2d_nhwc_ohwi_dsp.arm_cpu",
                )
            elif target.features.has_dsp and kernel_layout == "HWOI":
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp),
                    wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp),
                    name="conv2d_nhwc_dsp.arm_cpu",
                )
            elif kernel_layout == "HWIO":
                has_asimd = target.features.has_asimd
                has_dot_prod = target.features.has_dotprod
                # Quantized GEMM-based schedules; both native and
                # interleaved may be registered so the tuner can choose.
                if has_dot_prod and data.dtype in ["int8", "uint8"]:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
                        name="conv2d_NHWC_quantized_native.arm_cpu",
                    )
                if has_asimd and data.dtype in ["int8", "uint8"]:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
                        name="conv2d_NHWC_quantized_interleaved.arm_cpu",
                    )
                if (not has_asimd) or (data.dtype not in ["int8", "uint8"]):
                    # TODO(@giuseros)
                    # This strategy errors out for quantized data types when tuning.
                    # Let's use this only for non-aarch64 or non-quantized cases
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack),
                        name="conv2d_nhwc_spatial_pack.arm_cpu",
                    )
            else:
                raise RuntimeError(
                    "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout)
                )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for arm cpu".format(layout))
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        # ---- Depthwise conv2d ----
        if layout == "NCHW":
            assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
            if kernel_layout == "OIHW":
                data_width_padding = _get_padding_width(padding)
                # DSP variant: unit dilation and SIMD-aligned padded data
                # width / kernel width.
                if (
                    target.features.has_dsp
                    and dilation_w == dilation_h == 1
                    and _is_simd_aligned(data.dtype, data.shape[3:], padding=(data_width_padding,))
                    and _is_simd_aligned(kernel.dtype, kernel.shape[3:])
                ):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.arm_cpu.depthwise_conv2d_nchw_oihw_dsp, need_out_layout=True
                        ),
                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_oihw_dsp),
                        name="depthwise_conv2d_nchw_oihw_dsp.arm_cpu",
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
                        wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
                        name="depthwise_conv2d_nchw.arm_cpu",
                    )
            # TODO:
            # This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
            # Let us comment it out but not remove.
            # see discussion:
            # https://discuss.tvm.apache.org/t/autotuner-incorrect-result-after-tuning-mobilenetv2-on-arm-cpu/6088
            # strategy.add_implementation(
            #     wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw_spatial_pack),
            #     wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack),
            #     name="depthwise_conv2d_nchw_spatial_pack.arm_cpu",
            #     plevel=15)
            # Intel x86 depthwise conv2d schedule.
            channel_multiplier = get_const_tuple(inputs[1].shape)[1]
            if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
                    name="depthwise_conv2d_nchw.x86",
                )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            if target.features.has_asimd:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.arm_cpu.compute_depthwise_conv2d_nhwc),
                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc),
                    name="depthwise_conv2d_nhwc.arm_cpu",
                )
            # Optimized special case depthwiseConv2D operation. Requires NHWC layout,
            # a HWOI kernel layout (which we rearrange to a custom layout) no dilation,
            # int8/16 inputs, int32 output, and the same number of input and output channels.
            # The int8 implementation DOES need the DSP unit (for SXTB16), but it is not
            # possible to use the DSP unit to speed up a NHWC depthwise convolution (though
            # an NCHW convolution would benefit).
            elif (
                dilation_w == dilation_h == 1
                and kernel.shape[3] == 1  # channel_multiplier == 1
                and out_type.dtype == "int32"
                and (
                    (data.shape[3] % 4 == 0 and data.dtype == "int8" and target.features.has_dsp)
                    or (data.shape[3] % 2 == 0 and data.dtype == "int16")
                )
                and (padding != "SAME" or data.shape[1] % stride_h == data.shape[2] % stride_w == 0)
                # Ideally we should check that kernel is a Relay constant, but strategy functions
                # don't have access to the data needed to check this.
            ):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nhwc_dsp),
                    wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nhwc_dsp),
                    name="depthwise_conv2d_nhwc_dsp.arm_cpu",
                )
            else:
                logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc, need_kernel_layout=True),
                    wrap_topi_schedule(conv2d_generic.schedule_depthwise_conv2d_nhwc),
                    name="depthwise_conv2d_nhwc.generic",
                )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {} for arm cpu".format(layout))
    else:  # group_conv2d
        # ---- Grouped conv2d (groups > 1, not depthwise) ----
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.arm_cpu.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.arm_cpu.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.arm_cpu",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            logger.warning("group_conv2d with layout NHWC is not optimized for arm cpu.")
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.group_conv2d_nhwc, has_groups=True),
                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nhwc),
                name="group_conv2d_nhwc.generic",
            )
        else:
            raise RuntimeError("Unsupported group_conv2d layout {} for arm cpu".format(layout))
    return strategy
@conv2d_NCHWc_strategy.register("arm_cpu")
def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_NCHWc adopted from x86.

    Uses the ARM int8 implementation when the hardware supports it,
    otherwise falls back to the x86 NCHWc schedule.
    """
    strategy = _op.OpStrategy()
    data, kernel = inputs
    if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
        compute, schedule, name = (
            topi.arm_cpu.conv2d_NCHWc_int8,
            topi.arm_cpu.schedule_conv2d_NCHWc_int8,
            "conv2d_NCHWc_int8.arm_cpu",
        )
    else:
        compute, schedule, name = (
            topi.x86.conv2d_NCHWc,
            topi.x86.schedule_conv2d_NCHWc,
            "conv2d_NCHWc.x86",
        )
    strategy.add_implementation(
        wrap_compute_conv2d(compute, need_data_layout=True, need_out_layout=True),
        wrap_topi_schedule(schedule),
        name=name,
    )
    return strategy
@depthwise_conv2d_NCHWc_strategy.register("arm_cpu")
def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
    """depthwise_conv2d_NCHWc adopted from x86 (no ARM-specific schedule)."""
    strategy = _op.OpStrategy()
    compute = wrap_compute_conv2d(
        topi.x86.depthwise_conv2d_NCHWc, need_data_layout=True, need_out_layout=True
    )
    schedule = wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc)
    strategy.add_implementation(compute, schedule, name="depthwise_conv2d_NCHWc.x86")
    return strategy
def wrap_compute_conv2d_winograd_nnpack(topi_compute):
    """wrap topi compute for conv2d_winograd NNPack.

    Adapts a relay strategy compute signature (attrs, inputs, out_type)
    to the NNPack winograd topi compute, resolving "same"/empty out_dtype
    to the input dtype.
    """
    def _compute_conv2d_nnpack(attrs, inputs, out_type):
        padding = attrs.get_int_tuple("padding")
        strides = attrs.get_int_tuple("strides")
        dilation = attrs.get_int_tuple("dilation")
        requested_dtype = attrs.get_str("out_dtype")
        out_dtype = requested_dtype if requested_dtype not in ("same", "") else inputs[0].dtype
        # Third positional argument (bias) is always None here.
        return [topi_compute(inputs[0], inputs[1], None, strides, padding, dilation, out_dtype)]
    return _compute_conv2d_nnpack
@conv2d_winograd_without_weight_transform_strategy.register("arm_cpu")
def conv2d_winograd_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transform arm cpu strategy."""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    strides = attrs.get_int_tuple("strides")
    kernel = inputs[1]
    assert dilation == (1, 1), "Do not support dilate now"
    assert strides == (1, 1), "Do not support strides now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout != "NCHW":
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transform layout {}".format(layout)
        )
    kernel_rank = len(kernel.shape)
    if kernel_rank == 5:
        # Kernel already winograd-transformed: recover the original spatial
        # size from the padded size and the tile size.
        pad_kh, pad_kw, _, _, _ = get_const_tuple(inputs[1].shape)
        tile_size = attrs.get_int("tile_size")
        kh = pad_kh - tile_size + 1
        kw = pad_kw - tile_size + 1
        assert kh == 3 and kw == 3
        strategy.add_implementation(
            wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_winograd),
            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_winograd),
            name="conv2d_nchw_winograd.arm_cpu",
        )
    elif kernel_rank == 4:
        # kernel must be packed by winograd nnpack
        assert "nnpack" in target.libs
        strategy.add_implementation(
            wrap_compute_conv2d_winograd_nnpack(
                topi.arm_cpu.conv2d_nchw_winograd_nnpack_without_weight_transform
            ),
            wrap_topi_schedule(
                topi.arm_cpu.schedule_conv2d_nchw_winograd_nnpack_without_weight_transform
            ),
            # NOTE(review): the "withou" typo below is preserved on purpose —
            # the name is an identifier that existing tuning logs may reference.
            name="conv2d_nchw_winograd_nnpack_withou_weight_transform.arm_cpu",
            plevel=15,
        )
    else:
        raise RuntimeError("Unsupported kernel shape: {}".format(kernel.shape))
    return strategy
def wrap_compute_conv2d_gemm(topi_compute):
    """wrap topi compute for conv2d_gemm.

    Adapts a relay strategy compute signature (attrs, inputs, out_type)
    to the GEMM-based conv2d topi compute, resolving "same"/empty
    out_dtype to the input dtype.
    """
    def _compute_conv2d_gemm(attrs, inputs, out_type):
        padding = attrs.get_int_tuple("padding")
        strides = attrs.get_int_tuple("strides")
        dilation = attrs.get_int_tuple("dilation")
        requested_dtype = attrs.get_str("out_dtype")
        channels = attrs["channels"]
        kernel_size = attrs["kernel_size"]
        out_dtype = requested_dtype if requested_dtype not in ("same", "") else inputs[0].dtype
        args = (inputs[0], inputs[1], strides, padding, dilation, out_dtype, kernel_size, channels)
        return [topi_compute(*args)]
    return _compute_conv2d_gemm
@conv2d_gemm_without_weight_transform_strategy.register("arm_cpu")
def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_gemm_without_weight_transform arm cpu strategy.

    Registers both the native and the interleaved quantized GEMM
    implementations for NHWC int8/uint8 inputs whose weights were already
    transformed; raises for any other layout/dtype combination.
    """
    layout = attrs.data_layout
    data = inputs[0]
    strategy = _op.OpStrategy()
    interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform
    native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform
    if layout == "NHWC" and data.dtype in ["int8", "uint8"]:
        strategy.add_implementation(
            wrap_compute_conv2d_gemm(native_compute),
            wrap_topi_schedule(
                topi.arm_cpu.schedule_conv2d_NHWC_quantized_native_without_transform
            ),
            name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
        )
        strategy.add_implementation(
            wrap_compute_conv2d_gemm(interleaved_compute),
            wrap_topi_schedule(
                topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved_without_transform
            ),
            name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
        )
    else:
        # Fixed: the adjacent string literals previously concatenated to
        # "...layout {0}with datatype {1}" (missing space before "with").
        raise RuntimeError(
            "Unsupported conv2d_NHWC_quantized_without_transform layout {0} "
            "with datatype {1}".format(layout, data.dtype)
        )
    return strategy
@conv2d_transpose_strategy.register("arm_cpu")
def conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_transpose arm cpu strategy."""
    layout = attrs.data_layout
    dilation = get_const_tuple(attrs.dilation)
    groups = attrs.groups
    assert layout == "NCHW", "only support nchw for now"
    assert dilation == (1, 1), "not support dilate now"
    assert groups == 1, "only support groups == 1 for now"
    strategy = _op.OpStrategy()
    compute = wrap_compute_conv2d_transpose(topi.arm_cpu.conv2d_transpose_nchw)
    schedule = wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_transpose_nchw)
    # NOTE(review): the "tranpose" typo in the name is preserved on purpose —
    # it is an identifier that existing tuning logs may reference.
    strategy.add_implementation(compute, schedule, name="conv2d_tranpose_nchw.arm_cpu")
    return strategy
@bitserial_conv2d_strategy.register("arm_cpu")
def bitserial_conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
    """bitserial_conv2d arm cpu strategy."""
    strategy = _op.OpStrategy()
    layout = attrs.data_layout
    if layout == "NCHW":
        # No ARM-specific NCHW bitserial schedule exists; reuse the x86 one
        # (the implementation name keeps the arm_cpu suffix used upstream).
        compute = topi.x86.bitserial_conv2d_nchw
        schedule = topi.x86.schedule_bitserial_conv2d_nchw
        name = "bitserial_conv2d_nchw.arm_cpu"
    elif layout == "NHWC":
        compute = topi.arm_cpu.bitserial_conv2d_nhwc
        schedule = topi.arm_cpu.schedule_bitserial_conv2d_nhwc
        name = "bitserial_conv2d_nhwc.arm_cpu"
    else:
        raise ValueError("Data layout {} not supported.".format(layout))
    strategy.add_implementation(
        wrap_compute_bitserial_conv2d(compute),
        wrap_topi_schedule(schedule),
        name=name,
    )
    return strategy
@bitserial_dense_strategy.register("arm_cpu")
def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target):
    """bitserial_dense arm cpu strategy."""
    strategy = _op.OpStrategy()
    compute = wrap_compute_bitserial_dense(topi.arm_cpu.bitserial_dense)
    schedule = wrap_topi_schedule(topi.arm_cpu.schedule_bitserial_dense)
    strategy.add_implementation(compute, schedule, name="bitserial_dense.arm_cpu")
    return strategy
@dense_strategy.register(["arm_cpu"])
def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
    """dense arm cpu strategy.

    Prefers the DSP kernel for int8/int16 data, then a hand-written
    dynamic matrix-vector kernel, and finally the generic schedule.
    """
    strategy = _op.OpStrategy()
    data, _ = inputs
    if target.features.has_dsp and data.dtype in ["int8", "int16"]:
        strategy.add_implementation(
            wrap_compute_dense(topi.arm_cpu.dense_dsp),
            wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp),
            name="dense_dsp.arm_cpu",
        )
        return strategy
    # For dynamic matrix-vector multiply we use a hand written kernel.
    use_dynamic_kernel = (
        isinstance(inputs[0].shape[0], (int, tir.IntImm))
        and inputs[0].shape[0] == 1
        and (
            topi.utils.is_dynamic_shape(inputs[0].shape)
            or topi.utils.is_dynamic_shape(inputs[1].shape)
        )
    )
    if use_dynamic_kernel:
        strategy.add_implementation(
            wrap_compute_dense(topi.x86.dense_dynamic),
            wrap_topi_schedule(topi.x86.schedule_dense_dynamic),
            name="dense_dynamic.x86",
            plevel=20,
        )
        return strategy
    logger.warning("dense is not optimized for arm cpu.")
    strategy.add_implementation(
        wrap_compute_dense(
            topi.nn.dense,
            need_auto_scheduler_layout=is_auto_scheduler_enabled(),
            need_meta_schedule_layout=is_meta_schedule_enabled(),
        ),
        wrap_topi_schedule(topi.generic.schedule_dense),
        name="dense.generic",
    )
    return strategy
@conv1d_strategy.register("arm_cpu")
def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv1d arm cpu strategy."""
    strategy = _op.OpStrategy()
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    dilation = get_const_tuple(attrs.dilation)
    if dilation[0] < 1:
        raise ValueError("dilation should be a positive value")
    if kernel_layout == "WOI":
        # The only WOI-kernel implementation is the DSP one, which needs
        # NWC data and DSP hardware support.
        if layout == "NWC" and target.features.has_dsp:
            strategy.add_implementation(
                wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp),
                wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp),
                name="conv1d_dsp.arm_cpu",
            )
        else:
            raise RuntimeError(
                "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(
                    kernel_layout, layout
                )
            )
    elif layout in ("NCW", "NWC"):
        # No ARM-specific schedule for these: fall back to the generic ones.
        logger.warning("conv1d with layout %s is not optimized for arm cpu.", layout)
        compute, schedule, name = {
            "NCW": (topi.nn.conv1d_ncw, topi.generic.schedule_conv1d_ncw, "conv1d_ncw.generic"),
            "NWC": (topi.nn.conv1d_nwc, topi.generic.schedule_conv1d_nwc, "conv1d_nwc.generic"),
        }[layout]
        strategy.add_implementation(
            wrap_compute_conv1d(compute), wrap_topi_schedule(schedule), name=name
        )
    else:
        raise RuntimeError(
            "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(kernel_layout, layout)
        )
    return strategy