# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _vta-mat-mult-opt:

Matrix Multiply Blocking
========================
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_

This tutorial provides an overview of how to use TVM to map matrix
multiplication efficiently onto the VTA design.
We recommend covering the :ref:`basic-mat-mult` tutorial first.

In this tutorial, we will demonstrate TVM schedule optimizations to break large
neural network operators down into smaller blocks so that the computation fits
within limited hardware accelerator resources.
"""
######################################################################
# RPC Setup
# ---------
# We start by programming the Pynq's FPGA and building its RPC runtime.
from __future__ import absolute_import, print_function
import os
import tvm
import vta
import numpy as np
from tvm import rpc
from tvm.contrib import util
from vta.testing import simulator
# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()
# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
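# For example, one might point this tutorial at a Pynq board by exporting
# these variables in the shell before running the script (the values below
# are only illustrative; they match the defaults used above):
#
#   export VTA_PYNQ_RPC_HOST=192.168.2.99
#   export VTA_PYNQ_RPC_PORT=9091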
# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":
    # Make sure that TVM was compiled with RPC=1
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)
    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)
    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)
# In simulation mode, host the RPC server locally.
elif env.TARGET in ["sim", "tsim"]:
    remote = rpc.LocalSession()
######################################################################
# Computation Declaration
# -----------------------
# As a first step, we need to describe our matrix multiplication computation.
# We define the matrix multiplication as the computation one would find in a
# fully connected layer, defined by its batch size, input channels, and output
# channels.
# These have to be integer multiples of the VTA tensor shape:
# :code:`BATCH`, :code:`BLOCK_IN`, and :code:`BLOCK_OUT` respectively.
#
# We've added extra operators to the matrix multiplication that apply
# shifting and clipping to the output in order to mimic a fixed-point
# matrix multiplication followed by a rectified linear activation.
# We describe the TVM dataflow graph of the fully connected layer below:
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/fc_dataflow.png
#    :align: center
#
# This computation is intentionally too large to fit onto VTA's on-chip
# buffers all at once. Therefore in the scheduling phase we'll
# rely on computation blocking strategies to break the computation down into
# manageable chunks.
# Fully connected layer dimensions: 1024 x 1024
batch_size = 1
in_channels = 1024
out_channels = 1024
assert batch_size % env.BATCH == 0
assert in_channels % env.BLOCK_IN == 0
assert out_channels % env.BLOCK_OUT == 0
# Let's derive the tiled input tensor shapes
data_shape = (batch_size // env.BATCH,
              in_channels // env.BLOCK_IN,
              env.BATCH,
              env.BLOCK_IN)
weight_shape = (out_channels // env.BLOCK_OUT,
                in_channels // env.BLOCK_IN,
                env.BLOCK_OUT,
                env.BLOCK_IN)
output_shape = (batch_size // env.BATCH,
                out_channels // env.BLOCK_OUT,
                env.BATCH,
                env.BLOCK_OUT)
num_ops = in_channels * out_channels * batch_size * 2
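# As a quick sanity check, print the packed tensor shapes. With the default
# VTA configuration (BATCH=1, BLOCK_IN=16, BLOCK_OUT=16) they should work out
# to data: (1, 64, 1, 16), weight: (64, 64, 16, 16), output: (1, 64, 1, 16);
# the exact numbers depend on your vta_config.json.
print("data_shape:  ", data_shape)
print("weight_shape:", weight_shape)
print("output_shape:", output_shape)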
# Reduction axes
ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')
# Input placeholder tensors
data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
weight = tvm.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype)
# Copy buffers
data_buf = tvm.compute(data_shape,
                       lambda *i: data(*i),
                       "data_buf")
weight_buf = tvm.compute(weight_shape,
                         lambda *i: weight(*i),
                         "weight_buf")
# Declare matrix multiply computation
res_gemm = tvm.compute(output_shape,
                       lambda bo, co, bi, ci: tvm.sum(
                           data_buf[bo, ic, bi, ic_tns].astype(env.acc_dtype) *
                           weight_buf[co, ic, ci, ic_tns].astype(env.acc_dtype),
                           axis=[ic, ic_tns]),
                       name="res_gemm")
# Add shift stage for fixed-point normalization
res_shr = tvm.compute(output_shape,
                      lambda *i: res_gemm(*i) >> env.INP_WIDTH,
                      name="res_shr")
# Apply clipping between (0, input max value)
inp_max = (1 << (env.INP_WIDTH - 1)) - 1
res_max = tvm.compute(output_shape,
                      lambda *i: tvm.max(res_shr(*i), 0),
                      "res_max")
res_min = tvm.compute(output_shape,
                      lambda *i: tvm.min(res_max(*i), inp_max),
                      "res_min")
# Apply typecast to input data type before sending results back
res = tvm.compute(output_shape,
                  lambda *i: res_min(*i).astype(env.inp_dtype),
                  name="res")
######################################################################
# Scheduling the Computation
# --------------------------
# We'll look at the set of schedule transformations necessary to map the
# matrix multiplication onto VTA efficiently.
# Those include:
#
# - Computation blocking
# - Lowering to VTA hardware intrinsics
# Create TVM schedule
s = tvm.create_schedule(res.op)
# Let's look at the default TVM schedule
print(tvm.lower(s, [data, weight, res], simple_mode=True))
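# With the default VTA configuration, the printed schedule iterates over all
# 64 output-channel tiles and reduces over all 64 input-channel tiles in one
# go; the weight and accumulator working sets of such a schedule would not
# fit in VTA's on-chip SRAM, which is what the blocking steps below address.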
######################################################################
# Blocking the Computation
# ~~~~~~~~~~~~~~~~~~~~~~~~
# The matrix multiplication is by default too large for activations or weights
# to fit on VTA's on-chip buffers all at once.
# We block the (1, 1024) by (1024, 1024) matrix multiplication into
# smaller (1, 256) by (256, 256) matrix multiplications so the intermediate
# tensors can fit on the accelerator's on-chip SRAM.
# This approach is similar to blocking techniques applied to CPUs and GPUs in
# order to increase cache hit rate.
#
# We perform blocking along each axis (the batch axis is left untouched since
# we are performing single-batch inference).
# We also leave the inner-most tensorization axes as-is in order to allow
# TVM to pattern-match tensorization.
# We show the outcome of blocking on the computation schedule in the diagram
# below:
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/blocking.png
#    :align: center
#    :width: 480px
#
# .. note::
#
#   The code after loop splitting and reordering is equivalent to the following
#   pseudo-code. We ignore the batch axis since we are only performing single-batch
#   inference in this example:
#
#   .. code-block:: c
#
#      for (int oc_out = 0; oc_out < 4; ++oc_out) {
#        // Initialization loop
#        for (int oc_inn = 0; oc_inn < 16; ++oc_inn) {
#          for (int oc_tns = 0; oc_tns < 16; ++oc_tns) {
#            int j = (oc_out * 16 + oc_inn) * 16 + oc_tns;
#            C[0][j] = 0;
#          }
#        }
#        for (int ic_out = 0; ic_out < 4; ++ic_out) {
#          // Block loop
#          for (int oc_inn = 0; oc_inn < 16; ++oc_inn) {
#            for (int ic_inn = 0; ic_inn < 16; ++ic_inn) {
#              // Tensorization loop
#              for (int oc_tns = 0; oc_tns < 16; ++oc_tns) {
#                for (int ic_tns = 0; ic_tns < 16; ++ic_tns) {
#                  int i = (ic_out * 16 + ic_inn) * 16 + ic_tns;
#                  int j = (oc_out * 16 + oc_inn) * 16 + oc_tns;
#                  C[0][j] = C[0][j] + A[0][i] * B[j][i];
#                }
#              }
#            }
#          }
#        }
#      }
# Let's define tiling sizes (expressed in multiples of VTA tensor shape size)
b_block = 1 // env.BATCH
i_block = 256 // env.BLOCK_IN
o_block = 256 // env.BLOCK_OUT
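# With the default VTA configuration (BATCH=1, BLOCK_IN=16, BLOCK_OUT=16),
# these evaluate to b_block=1, i_block=16, o_block=16, i.e. each block covers
# 16 input-channel tiles and 16 output-channel tiles, matching the 16-wide
# inner block loops in the pseudo-code above.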
# Tile the output tensor along the batch and output channel dimensions
# (since by default we are doing single batch inference, the split along
# the batch dimension has no effect)
b, oc, b_tns, oc_tns = s[res].op.axis
b_out, b_inn = s[res].split(b, b_block)
oc_out, oc_inn = s[res].split(oc, o_block)
s[res].reorder(b_out, oc_out, b_inn, oc_inn)
# Move intermediate computation into each output compute tile
s[res_gemm].compute_at(s[res], oc_out)
s[res_shr].compute_at(s[res], oc_out)
s[res_max].compute_at(s[res], oc_out)
s[res_min].compute_at(s[res], oc_out)
# Apply additional loop split along reduction axis (input channel)
b_inn, oc_inn, b_tns, oc_tns = s[res_gemm].op.axis
ic_out, ic_inn = s[res_gemm].split(ic, i_block)
# Reorder axes. We move the ic_out axis all the way out of the GEMM
# loop to block along the reduction axis
s[res_gemm].reorder(ic_out, b_inn, oc_inn, ic_inn, b_tns, oc_tns, ic_tns)
# Let's look at the current TVM schedule after blocking
print(tvm.lower(s, [data, weight, res], simple_mode=True))
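# Compared to the previous printout, the output-channel loop and the ic
# reduction loop are now split into outer block loops and inner loops of
# extent o_block and i_block, with ic_out hoisted outside the GEMM loop nest
# as requested by the reorder above.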
######################################################################
# Lowering Copies to DMA Transfers
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.
# We move the load loops into the matrix multiply computation loop to stage
# memory loads such that they fit in the on-chip SRAM buffers.
# Finally we annotate the load/store loop outer axes with the DMA copy pragma
# to perform bulk memory transfers on VTA.
# Set scope of SRAM buffers
s[data_buf].set_scope(env.inp_scope)
s[weight_buf].set_scope(env.wgt_scope)
s[res_gemm].set_scope(env.acc_scope)
s[res_shr].set_scope(env.acc_scope)
s[res_min].set_scope(env.acc_scope)
s[res_max].set_scope(env.acc_scope)
# Block data and weight cache reads
s[data_buf].compute_at(s[res_gemm], ic_out)
s[weight_buf].compute_at(s[res_gemm], ic_out)
# Use DMA copy pragma on DRAM->SRAM operations
s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy)
s[weight_buf].pragma(s[weight_buf].op.axis[0], env.dma_copy)
# Use DMA copy pragma on SRAM->DRAM operation
# (this implies that these copies should be performed along b_inn,
# or result axis 2)
s[res].pragma(s[res].op.axis[2], env.dma_copy)
######################################################################
# Lowering Computation to VTA Compute Intrinsics
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the matrix multiplication to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.
# Apply tensorization over the batch tensor tile axis
s[res_gemm].tensorize(b_tns, env.gemm)
# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)
# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))
######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.
# Compile the TVM module
my_gemm = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="my_gemm")
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")
# Get the remote device context
ctx = remote.ext_dev(0)
# Initialize the data and weight arrays randomly in the int range [-128, 128)
data_np = np.random.randint(
    -128, 128, size=(batch_size, in_channels)).astype(data.dtype)
weight_np = np.random.randint(
    -128, 128, size=(out_channels, in_channels)).astype(weight.dtype)
# Apply packing to the data and weight arrays from a 2D to a 4D packed layout
data_packed = data_np.reshape(batch_size // env.BATCH,
                              env.BATCH,
                              in_channels // env.BLOCK_IN,
                              env.BLOCK_IN).transpose((0, 2, 1, 3))
weight_packed = weight_np.reshape(out_channels // env.BLOCK_OUT,
                                  env.BLOCK_OUT,
                                  in_channels // env.BLOCK_IN,
                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
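# With the default configuration (BATCH=1, BLOCK_IN=16, BLOCK_OUT=16), this
# packing turns the (1, 1024) data matrix into a (1, 64, 1, 16) array and the
# (1024, 1024) weight matrix into a (64, 64, 16, 16) array, matching the
# data_shape and weight_shape placeholders declared earlier.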
# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
weight_nd = tvm.nd.array(weight_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)
# Clear stats
if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()
# Invoke the module to perform the computation
f(data_nd, weight_nd, res_nd)
# Verify against numpy implementation
res_ref = np.dot(data_np.astype(env.acc_dtype),
                 weight_np.T.astype(env.acc_dtype))
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape(batch_size // env.BATCH,
                          env.BATCH,
                          out_channels // env.BLOCK_OUT,
                          env.BLOCK_OUT).transpose((0, 2, 1, 3))
np.testing.assert_equal(res_ref, res_nd.asnumpy())
# Print stats
if env.TARGET in ["sim", "tsim"]:
    sim_stats = simulator.stats()
    print("Execution statistics:")
    for k, v in sim_stats.items():
        print("\t{:<16}: {:>16}".format(k, v))
print("Successful blocked matrix multiply test!")
######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can achieve
# computation blocking for a matrix multiplication example.
# This allows us to map arbitrarily large computation onto limited
# hardware accelerator resources.
#