# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=unused-argument, ungrouped-imports
"""Namespace for supporting Relay operators on VTA."""
from __future__ import absolute_import as _abs
import tvm
from tvm import te
from tvm import autotvm
from tvm import topi
from tvm.relay.op import op as reg
from tvm.relay.op import strategy as _strategy
from tvm.relay.op.op import OpPattern, OpStrategy
from .utils import is_packed_layout
from .vta_conv2d import conv2d_packed, schedule_conv2d_packed
from .vta_conv2d_transpose import conv2d_transpose_packed, schedule_conv2d_transpose_packed
from .vta_group_conv2d import group_conv2d_packed, schedule_group_conv2d_packed
from .vta_dense import dense_packed, schedule_dense_packed
from ..environment import get_env
ENV = get_env()

# override to force partition at copy
reg.register_pattern("copy", OpPattern.INJECTIVE, level=15)


# add clip vta strategy
def compute_clip_vta(attrs, inputs, output_type):
""" Clip operator. """
x = inputs[0]
a_min = attrs.a_min
a_max = attrs.a_max
const_min = tvm.tir.const(a_min, x.dtype)
const_max = tvm.tir.const(a_max, x.dtype)
with tvm.te.tag_scope(topi.tag.ELEMWISE):
x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
return [x]
def clip_strategy_vta(attrs, inputs, out_type, target):
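    """clip vta strategy"""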
    strategy = OpStrategy()
    strategy.add_implementation(
        compute_clip_vta,
        _strategy.wrap_topi_schedule(topi.generic.schedule_injective),
        name="clip.vta",
    )
    return strategy

reg.get("clip").get_attr("FTVMStrategy").register(clip_strategy_vta, "vta")

@autotvm.register_topi_compute("add.vta")
def add_packed(cfg, lhs, rhs):
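    """add compute registered for VTA (delegates to topi.add)"""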
    return topi.add(lhs, rhs)

@autotvm.register_topi_compute("multiply.vta")
def multiply_packed(cfg, lhs, rhs):
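    """multiply compute registered for VTA (delegates to topi.multiply)"""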
    return topi.multiply(lhs, rhs)

def schedule_alu_packed(cfg, outs):
"""alu packed schedule"""
assert len(outs) == 1
def is_cast_op(op):
return op.name == "T_cast"
outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
output = outs[0]
s = te.create_schedule([x.op for x in outs])
te.schedule.AutoInlineInjective(s)
# other target does not support alu-only ops
if not (ENV.TARGET in ["sim", "tsim", "intelfocl"]):
return s
# only put the int-related ops to vta
if "int" in output.dtype and len(output.shape) == 6:
ewise_inputs = []
ewise_ops = []
const_ops = []
def _traverse(op):
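            """Walk the producers of the output, collecting elementwise ops, const ops, and their input tensors."""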
            if topi.tag.is_broadcast(op.tag):
                if not op.same_as(output.op):
                    if not op.axis:
                        const_ops.append(op)
                    elif not is_cast_op(op):
                        ewise_ops.append(op)

                for tensor in op.input_tensors:
                    if isinstance(tensor.op, tvm.te.PlaceholderOp):
                        ewise_inputs.append((op, tensor))
                    elif is_cast_op(tensor.op) and not op.same_as(output.op):
                        ewise_inputs.append((op, tensor))
                    else:
                        _traverse(tensor.op)
            else:
                for tensor in op.input_tensors:
                    if (not isinstance(tensor.op, tvm.te.PlaceholderOp)) and (
                        not is_cast_op(tensor.op)
                    ):
                        _traverse(tensor.op)

        op = output.op
        _traverse(op)

        for _, t in ewise_inputs:
            if t.dtype == "float32":
                return s

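        # the packed layout is 6-D: (batch outer, channel outer, height, width, batch inner, channel inner)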
        x_bo, x_co, x_i, x_j, x_bi, x_ci = s[output].op.axis

        cfg.define_split("tile_co", x_co, num_outputs=2)
        cfg.define_split("tile_h", x_i, num_outputs=2)
        cfg.define_split("tile_w", x_j, num_outputs=2)

        x_co0, x_co1 = cfg["tile_co"].apply(s, output, x_co)
        x_i0, x_i1 = cfg["tile_h"].apply(s, output, x_i)
        x_j0, x_j1 = cfg["tile_w"].apply(s, output, x_j)
        s[output].reorder(x_bo, x_i0, x_co0, x_j0, x_co1, x_i1, x_j1, x_bi, x_ci)
        store_pt = x_j0

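        # stage elementwise ops in the accumulator scratchpad, run them on the ALU,
        # and attach them at the store point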
        for e_o in ewise_ops:
            s[e_o].set_scope(ENV.acc_scope)
            s[e_o].pragma(s[e_o].op.axis[0], ENV.alu)
            s[e_o].compute_at(s[output], store_pt)

        # cache read input
        cache_read_ewise = []
        for consumer, tensor in ewise_inputs:
            cache_read_ewise.append(s.cache_read(tensor, ENV.acc_scope, [consumer]))

        for tensor in cache_read_ewise:
            if s[tensor].op.axis:
                s[tensor].pragma(s[tensor].op.axis[0], ENV.dma_copy)
                s[tensor].compute_at(s[output], store_pt)

        for op in const_ops:
            s[op].compute_inline()

        s[output].pragma(x_co1, ENV.dma_copy)

    return s

@autotvm.register_topi_schedule("add.vta")
def schedule_add_packed(cfg, outs):
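    """Schedule add.vta via the shared ALU packed schedule."""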
    return schedule_alu_packed(cfg, outs)

@autotvm.register_topi_schedule("multiply.vta")
def schedule_multiply_packed(cfg, outs):
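    """Schedule multiply.vta via the shared ALU packed schedule."""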
    return schedule_alu_packed(cfg, outs)

def add_strategy_vta(attrs, inputs, out_type, target):
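    """add vta strategy"""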
    strategy = OpStrategy()
    strategy.add_implementation(
        _strategy.wrap_topi_compute(add_packed),
        _strategy.wrap_topi_schedule(schedule_add_packed),
        name="add.vta",
    )
    return strategy

def multiply_strategy_vta(attrs, inputs, out_type, target):
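    """multiply vta strategy"""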
    strategy = OpStrategy()
    strategy.add_implementation(
        _strategy.wrap_topi_compute(multiply_packed),
        _strategy.wrap_topi_schedule(schedule_multiply_packed),
        name="multiply.vta",
    )
    return strategy

# other targets do not support ALU-only ops
if ENV.TARGET in ["sim", "intelfocl"]:
    reg.get("add").get_attr("FTVMStrategy").register(add_strategy_vta, "vta")
    reg.get("multiply").get_attr("FTVMStrategy").register(multiply_strategy_vta, "vta")

@_strategy.conv2d_strategy.register("vta")
def conv2d_strategy_vta(attrs, inputs, out_type, target):
"""conv2d vta strategy"""
strategy = OpStrategy()
kernel = inputs[1]
dilation = topi.utils.get_const_tuple(attrs.dilation)
groups = attrs.groups
layout = attrs.data_layout
assert dilation == (1, 1), "support for dilation limited to (1, 1)"
if is_packed_layout(layout):
if groups == 1:
assert ENV.LOG_INP_WIDTH == 3, "only support 8bit inp for now"
assert ENV.LOG_WGT_WIDTH == 3, "only support 8bit wgt for now"
assert kernel.dtype == "int8"
strategy.add_implementation(
_strategy.wrap_compute_conv2d(conv2d_packed, True),
_strategy.wrap_topi_schedule(schedule_conv2d_packed),
name="conv2d_packed.vta",
)
else: # group_conv2d
strategy.add_implementation(
_strategy.wrap_compute_conv2d(group_conv2d_packed, has_groups=True),
_strategy.wrap_topi_schedule(schedule_group_conv2d_packed),
name="group_conv2d_packed.vta",
)
return strategy
# If it's not packed, run on ARM CPU
arm_tgt = tvm.target.arm_cpu(target.model)
return _strategy.arm_cpu.conv2d_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt)
@_strategy.conv2d_transpose_strategy.register("vta")
def conv2d_transpose_strategy_vta(attrs, inputs, out_type, target):
"""conv2d_transpose vta strategy"""
dilation = topi.utils.get_const_tuple(attrs.dilation)
layout = attrs.data_layout
assert dilation == (1, 1), "support for dilation limited to (1, 1)"
if is_packed_layout(layout):
strategy = OpStrategy()
strategy.add_implementation(
_strategy.wrap_compute_conv2d_transpose(conv2d_transpose_packed),
_strategy.wrap_topi_schedule(schedule_conv2d_transpose_packed),
name="conv2d_transpose_packed.vta",
)
return strategy
# If it's not packed, run on ARM CPU
arm_tgt = tvm.target.arm_cpu(target.model)
return _strategy.arm_cpu.conv2d_transpose_strategy_arm_cpu(attrs, inputs, out_type, arm_tgt)
@_strategy.dense_strategy.register("vta")
def dense_strategy_vta(attrs, inputs, out_type, target):
"""dense vta strategy"""
if len(inputs[0].shape) == 4: # this implies the layout is packed
strategy = OpStrategy()
strategy.add_implementation(
_strategy.wrap_compute_dense(dense_packed),
_strategy.wrap_topi_schedule(schedule_dense_packed),
name="dense_packed.vta",
)
return strategy
# If it's not packed, run on ARM CPU
arm_tgt = tvm.target.arm_cpu(target.model)
return _strategy.x86.dense_strategy_cpu(attrs, inputs, out_type, arm_tgt)