# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
.. _basic-mat-mult:

Simple Matrix Multiply
======================
**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_

In this tutorial, we will build on top of the :ref:`vta-get-started` tutorial
and introduce additional concepts required to implement matrix multiplication
on VTA with the TVM workflow.
"""
######################################################################
# RPC Setup
# ---------
# We start by programming the Pynq's FPGA and building its RPC runtime
# as we did in the VTA introductory tutorial.
from __future__ import absolute_import, print_function
import os
import tvm
import vta
import numpy as np
from tvm import rpc
from tvm.contrib import util
from vta.testing import simulator
# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()
# We read the Pynq RPC host IP address and port number from the OS environment
host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
# We configure both the bitstream and the runtime system on the Pynq
# to match the VTA configuration specified by the vta_config.json file.
if env.TARGET == "pynq":
    # Make sure that TVM was compiled with RPC=1
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)
    # Reconfigure the JIT runtime
    vta.reconfig_runtime(remote)
    # Program the FPGA with a pre-compiled VTA bitstream.
    # You can program the FPGA with your own custom bitstream
    # by passing the path to the bitstream file instead of None.
    vta.program_fpga(remote, bitstream=None)
# In simulation mode, host the RPC server locally.
elif env.TARGET in ["sim", "tsim"]:
    remote = rpc.LocalSession()
######################################################################
# Computation Declaration
# -----------------------
# In this example we describe a simple matrix multiplication, which
# requires multiple computation stages, as shown in the dataflow diagram below.
# First we describe the input tensors :code:`A` and :code:`B` that live
# in main memory.
# Second, we need to declare intermediate tensors :code:`A_buf` and
# :code:`B_buf`, which will live in VTA's on-chip buffers.
# Having this extra computational stage allows us to explicitly
# stage cached reads and writes.
# Third, we describe the matrix multiplication computation over
# :code:`A_buf` and :code:`B_buf` to produce the product matrix :code:`C_buf`.
# The last operation is a cast and copy back to DRAM, into results tensor
# :code:`C`.
#
# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/gemm_dataflow.png
#      :align: center
######################################################################
# Data Layout
# ~~~~~~~~~~~
# We describe the placeholder tensors :code:`A` and :code:`B` in a tiled data
# format to match the data layout requirements imposed by the VTA tensor core.
######################################################################
# .. note::
#
#   **Data Tiling**
#
#   One source of complexity when targeting accelerators is to make sure
#   that the data layout matches the layout imposed by the accelerator design.
#   VTA is designed around a *tensor core* that performs one matrix-matrix
#   operation per cycle between an activation matrix and a weight matrix,
#   adding the result matrix to an accumulator matrix, as shown in the
#   figure below.
#
#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/tensor_core.png
#        :align: center
#        :width: 480px
#
#   The dimensions of that matrix-matrix multiplication are specified in
#   the :code:`vta_config.json` configuration file.
#   The activation matrix has a :code:`(BATCH, BLOCK_IN)` shape
#   and the transposed weight matrix has a :code:`(BLOCK_OUT, BLOCK_IN)` shape,
#   so the resulting output matrix has a
#   :code:`(BATCH, BLOCK_OUT)` shape.
#   Consequently, input and output tensors processed by VTA need to be
#   tiled according to the aforementioned dimensions.
#
#   The diagram below shows the impact of data tiling on a matrix that is
#   originally of shape (4, 8).
#   Tiling by a (2, 2) tile shape ensures that data within each tile is
#   contiguous.
#   The resulting tiled tensor has a shape of (2, 4, 2, 2).
#
#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/data_tiling.png
#        :align: center
#        :width: 480px
#
# We first define the variables :code:`m`, :code:`n`, :code:`o` to represent
# the shape of the matrix multiplication. These variables are multiplicative
# factors over the :code:`BLOCK_OUT`, :code:`BLOCK_IN`, and :code:`BATCH`
# tensor dimensions respectively.
# By default, the configuration file sets :code:`BATCH`, :code:`BLOCK_IN`, and
# :code:`BLOCK_OUT` to be 1, 16 and 16 respectively (:code:`BATCH` being set to
# 1 implies that our compute building block is vector-matrix multiply).
#
######################################################################
# .. note::
#
#   **Data Types**
#
#   It's important to not only match the inner-tile
#   dimension of VTA's tensor core, but also to match the specific data types
#   expected by VTA.
#   VTA for now only supports fixed-point data types, whose integer width is
#   specified in the :code:`vta_config.json` file by :code:`INP_WIDTH` and
#   :code:`WGT_WIDTH` for the activations and weights data types respectively.
#   In addition, the accumulator data type integer width is specified by
#   :code:`ACC_WIDTH`.
#
# By default, the configuration file sets :code:`INP_WIDTH`
# and :code:`WGT_WIDTH` to 8.
# The accumulator width :code:`ACC_WIDTH` is set to 32, in order to avoid
# overflow during accumulation.
# As a result, :code:`env.inp_dtype` and :code:`env.wgt_dtype` are both
# narrow 8-bit integers, while :code:`env.acc_dtype` is a standard 32-bit
# integer.
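# As a quick sanity check of the note above, we can print the data types that
# the current configuration resolves to (the exact strings depend on the
# vta_config.json file in use):
print("inp dtype: {}, wgt dtype: {}, acc dtype: {}".format(
    env.inp_dtype, env.wgt_dtype, env.acc_dtype))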
# Output channel factor m - total 16x16=256 output channels
m = 16
# Input channel factor n - total 16x16=256 input channels
n = 16
# Batch factor o (we use single batch inference)
o = 1
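# To make the tiling described in the note above concrete, here is a small
# numpy sketch (illustration only, using the toy (4, 8) matrix and (2, 2)
# tiles from the diagram rather than the VTA shapes used below):
toy = np.arange(4 * 8).reshape(4, 8)
toy_tiled = toy.reshape(2, 2, 4, 2).transpose((0, 2, 1, 3))
# Each toy_tiled[i, j] slice is one contiguous (2, 2) tile of the original
assert toy_tiled.shape == (2, 4, 2, 2)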
# A placeholder tensor in tiled data format
A = tvm.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype)
# B placeholder tensor in tiled data format
B = tvm.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype)
# A copy buffer
A_buf = tvm.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf")
# B copy buffer
B_buf = tvm.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf")
######################################################################
# Matrix Multiplication
# ~~~~~~~~~~~~~~~~~~~~~
# Now we're ready to describe the matrix multiplication result tensor
# :code:`C_buf`, with another compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# In order to implement matrix multiplication, the lambda function needs to
# include a reduction formula over the input channel dimension axes.
# To create a reduction formula, we can declare a reduction axis using
# :code:`tvm.reduce_axis`, which takes in the range of reductions.
# :code:`tvm.sum` takes in the expression to be reduced as well as
# the reduction axes, and computes the sum of the expression over all values
# of the reduction variables in their declared ranges.
#
# Note that the reduction needs to be performed over 32-bit :code:`env.acc_dtype`
# accumulator data types.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.
# Outer input feature reduction axis
ko = tvm.reduce_axis((0, n), name="ko")
# Inner input feature reduction axis
ki = tvm.reduce_axis((0, env.BLOCK_IN), name="ki")
# Describe the in-VTA matrix multiplication
C_buf = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda bo, co, bi, ci:
        tvm.sum(A_buf[bo, ko, bi, ki].astype(env.acc_dtype) *
                B_buf[co, ko, ci, ki].astype(env.acc_dtype),
                axis=[ko, ki]),
    name="C_buf")
######################################################################
# Casting the Results
# ~~~~~~~~~~~~~~~~~~~
# After the computation is done, we'll need to send the results computed by VTA
# back to main memory.
######################################################################
# .. note::
#
#   **Memory Store Restrictions**
#
#   One restriction of VTA is that it only supports DRAM stores in the narrow
#   :code:`env.inp_dtype` data type format.
#   This lets us reduce the data footprint for memory transfers, but also lets
#   us quantize the wide accumulator data type down to a data format that
#   matches the input activation data type.
#   This means that in the context of neural network inference, the outputs
#   of a given layer after activation can be consumed directly by the next
#   layer.
#
# We perform one last typecast operation to the narrow
# input activation data format.
# Cast to output type, and send to main memory
C = tvm.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: C_buf(*i).astype(env.inp_dtype),
    name="C")
######################################################################
# This concludes the computation declaration part of this tutorial.
######################################################################
# Scheduling the Computation
# --------------------------
# While the above lines describe the computation rule, we can obtain
# :code:`C` in many ways.
# TVM asks the user to provide an implementation of the computation called
# a *schedule*.
#
# A schedule is a set of transformations to an original computation that
# transforms the implementation of the computation without affecting
# correctness.
# This simple VTA programming tutorial aims to demonstrate basic schedule
# transformations that will map the original schedule down to VTA hardware
# primitives.
######################################################################
# Default Schedule
# ~~~~~~~~~~~~~~~~
# After we construct the schedule, by default the schedule computes
# :code:`C` in the following way:
# Let's take a look at the generated schedule
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))
######################################################################
# Although this schedule makes sense, it won't compile to VTA.
# In order to obtain correct code generation, we need to apply scheduling
# primitives and code annotations that will transform the schedule into
# one that can be directly lowered onto VTA hardware intrinsics.
# These include:
#
# - DMA copy operations which will take globally-scoped tensors and copy
#   those into locally-scoped tensors.
# - Tensor operations that will perform the matrix multiplication.
######################################################################
# Buffer Scopes
# ~~~~~~~~~~~~~
# First, we set the scope of the buffers to tell TVM that these buffers
# will live in VTA's on-chip SRAM caches.
# Below, we tell TVM that :code:`A_buf`, :code:`B_buf`, :code:`C_buf`
# will respectively live in VTA's on-chip input, weight and accumulator
# memory.
######################################################################
# .. note::
#
#   **VTA's On-Chip SRAMs**
#
#   VTA has three different memory scopes, each corresponding to different
#   on-chip SRAM buffers.
#
#   - :code:`env.inp_scope`: Input buffer, which is a read-only SRAM buffer
#     that stores input matrices of shape :code:`(env.BATCH, env.BLOCK_IN)`
#     of type :code:`env.inp_dtype`. The input buffer contains
#     :code:`2 ^ LOG_INP_BUFF_SIZE` matrix elements (as specified in the
#     :code:`vta_config.json` file).
#   - :code:`env.wgt_scope`: Weight buffer, which is a read-only SRAM buffer
#     that stores weight matrices of shape :code:`(env.BLOCK_OUT, env.BLOCK_IN)`
#     of type :code:`env.wgt_dtype`. The weight buffer contains
#     :code:`2 ^ LOG_WGT_BUFF_SIZE` matrix elements.
#   - :code:`env.acc_scope`: Accumulator buffer, which is a read/write SRAM
#     buffer that stores accumulator matrices of shape
#     :code:`(env.BATCH, env.BLOCK_OUT)` of type :code:`env.acc_dtype`.
#     The accumulator buffer is VTA's general purpose register file: it holds
#     both intermediate results of convolutions and matrix multiplications
#     as well as intermediate results of pooling, batch normalization, and
#     activation layers. The accumulator buffer contains
#     :code:`2 ^ LOG_ACC_BUFF_SIZE` matrix elements.
# Set the intermediate tensor's scope to VTA's on-chip buffers
s[A_buf].set_scope(env.inp_scope)
s[B_buf].set_scope(env.wgt_scope)
s[C_buf].set_scope(env.acc_scope)
######################################################################
# DMA Transfers
# ~~~~~~~~~~~~~
# We need to schedule DMA transfers to move data living in DRAM to
# and from the VTA on-chip buffers.
# This can be achieved using the :code:`compute_at` schedule primitive
# which nests the copying of the buffers into the computation loop
# that performs the matrix multiplication.
#
# We insert :code:`dma_copy` pragmas to indicate to the compiler
# that the copy operations will be performed in bulk via DMA,
# which is common in hardware accelerators.
# Finally, we print the temporary schedule to observe the effects of
# moving the copy operations into the matrix multiplication loop.
# Move buffer copy into matrix multiply loop
s[A_buf].compute_at(s[C_buf], ko)
s[B_buf].compute_at(s[C_buf], ko)
# Tag the buffer copies with the DMA pragma to insert a DMA transfer
s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
s[C].pragma(s[C].op.axis[0], env.dma_copy)
# Let's take a look at the transformed schedule
print(tvm.lower(s, [A, B, C], simple_mode=True))
######################################################################
# Tensorization
# ~~~~~~~~~~~~~
# The last step of the schedule transformation consists of applying
# *tensorization* to our schedule.
# Tensorization is analogous to vectorization, but extends the concept
# to a higher-dimensional unit of computation.
# Consequently, tensorization imposes data layout constraints, as discussed
# earlier when declaring the data layout of the input placeholders.
# We've already arranged our tensors in a tiled format, so the next thing
# we need to perform is loop reordering to accommodate tensorization.
#
# Here we choose to move the outermost reduction axis all the way out.
# This dictates that we first iterate over input channels, then batch
# dimensions, and finally output channels.
# Lastly, we apply the tensorization scheduling primitive :code:`tensorize`
# along the outer axis of the inner-most matrix-matrix multiplication tensor
# block.
# We print the finalized schedule that is ready for code-generation
# by the VTA runtime JIT compiler.
s[C_buf].reorder(
    ko,
    s[C_buf].op.axis[0],
    s[C_buf].op.axis[1],
    s[C_buf].op.axis[2],
    s[C_buf].op.axis[3],
    ki)
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)
# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))
######################################################################
# This concludes the scheduling portion of this tutorial.
######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.
# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")
# Write the compiled module into an object file.
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
# Send the executable over RPC
remote.upload(temp.relpath("gemm.o"))
# Load the compiled module
f = remote.load_module("gemm.o")
######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from
# any language.
#
# TVM provides an array API in Python to aid quick testing and prototyping.
# The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard.
#
# - We first create a remote context (for remote execution on the Pynq).
# - Then :code:`tvm.nd.array` formats the data accordingly.
# - :code:`f()` runs the actual computation.
# - :code:`asnumpy()` copies the result array back into a format that can be
#   interpreted.
#
# Get the remote device context
ctx = remote.ext_dev(0)
# Initialize the A and B arrays randomly in the int range of [-128, 128)
A_orig = np.random.randint(
    -128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype)
B_orig = np.random.randint(
    -128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype)
# Apply packing to the A and B arrays from a 2D to a 4D packed layout
A_packed = A_orig.reshape(
    o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
B_packed = B_orig.reshape(
    m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3))
# Format the input/output arrays with tvm.nd.array to the DLPack standard
A_nd = tvm.nd.array(A_packed, ctx)
B_nd = tvm.nd.array(B_packed, ctx)
C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx)
# Clear stats
if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()
# Invoke the module to perform the computation
f(A_nd, B_nd, C_nd)
######################################################################
# Verifying Correctness
# ---------------------
# Compute the reference result with numpy and assert that the output of the
# matrix multiplication is indeed correct
# Compute reference result with numpy
C_ref = np.dot(A_orig.astype(env.acc_dtype),
               B_orig.T.astype(env.acc_dtype)).astype(C.dtype)
C_ref = C_ref.reshape(
    o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3))
np.testing.assert_equal(C_ref, C_nd.asnumpy())
# Print stats
if env.TARGET in ["sim", "tsim"]:
    sim_stats = simulator.stats()
    print("Execution statistics:")
    for k, v in sim_stats.items():
        print("\t{:<16}: {:>16}".format(k, v))
print("Successful matrix multiply test!")
######################################################################
# Summary
# -------
# This tutorial showcases the TVM workflow to implement a simple matrix
# multiplication example on VTA.
# The general workflow includes:
#
# - Programming the FPGA with the VTA bitstream over RPC.
# - Describing matrix multiplication via a series of computations.
# - Describing how we want to perform the computation using schedule primitives.
# - Compiling the function to the VTA target.
# - Running the compiled module and verifying it against a numpy implementation.
#