| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| Deploy Pretrained Vision Model from MxNet on VTA |
| ================================================ |
| **Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_ |
| |
| This tutorial provides an end-to-end demo, on how to run ImageNet classification |
| inference onto the VTA accelerator design to perform ImageNet classification tasks. |
| It showcases Relay as a front end compiler that can perform quantization (VTA |
| only supports int8/32 inference) as well as graph packing (in order to enable |
| tensorization in the core) to massage the compute graph for the hardware target. |
| """ |
| |
| ###################################################################### |
| # Install dependencies |
| # -------------------- |
| # To use the autotvm package in tvm, we need to install some extra dependencies. |
| # (change "3" to "2" if you use python2): |
| # |
| # .. code-block:: bash |
| # |
| # pip3 install --user mxnet requests "Pillow<7" |
| # |
| # Now return to the python code. Import packages. |
| |
| from __future__ import absolute_import, print_function |
| |
| import argparse, json, os, requests, sys, time |
| from io import BytesIO |
| from os.path import join, isfile |
| from PIL import Image |
| |
| from mxnet.gluon.model_zoo import vision |
| import numpy as np |
| from matplotlib import pyplot as plt |
| |
| import tvm |
| from tvm import te |
| from tvm import rpc, autotvm, relay |
| from tvm.contrib import graph_executor, utils, download, graph_runtime |
| from tvm.contrib.debugger import debug_executor |
| from tvm.relay import transform |
| |
| import vta |
| from vta.testing import simulator |
| from vta.top import graph_pack |
| |
| |
| # Make sure that TVM was compiled with RPC=1 |
| assert tvm.runtime.enabled("rpc") |
| |
| ###################################################################### |
| # Define the platform and model targets |
| # ------------------------------------- |
| # Execute on CPU vs. VTA, and define the model. |
| |
| # Load VTA parameters from the 3rdparty/vta-hw/config/vta_config.json file |
| env = vta.get_env() |
| |
| # Set ``device=arm_cpu`` to run inference on the CPU |
| # or ``device=vta`` to run inference on the FPGA. |
| device = "vta" |
| target = env.target if device == "vta" else env.target_vta_cpu |
| |
| # Dictionary lookup for when to start/end bit packing |
| pack_dict = { |
| "resnet18_v1": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| "resnet34_v1": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| "resnet18_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| "resnet34_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| "resnet50_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| "resnet101_v2": ["nn.max_pool2d", "nn.global_avg_pool2d"], |
| } |
| |
| # Name of Gluon model to compile |
| # The ``start_pack`` and ``stop_pack`` labels indicate where |
| # to start and end the graph packing relay pass: in other words |
| # where to start and finish offloading to VTA. |
| model = "resnet18_v1" |
| assert model in pack_dict |
| |
| ###################################################################### |
| # Obtain an execution remote |
| # -------------------------- |
| # When target is 'pynq', reconfigure FPGA and runtime. |
| # Otherwise, if target is 'sim', execute locally. |
| |
| if env.TARGET not in ["sim", "tsim", "intelfocl"]: |
| |
| # Get remote from tracker node if environment variable is set. |
| # To set up the tracker, you'll need to follow the "Auto-tuning |
| # a convolutional network for VTA" tutorial. |
| tracker_host = os.environ.get("TVM_TRACKER_HOST", None) |
| tracker_port = os.environ.get("TVM_TRACKER_PORT", None) |
| # Otherwise if you have a device you want to program directly from |
| # the host, make sure you've set the variables below to the IP of |
| # your board. |
| device_host = os.environ.get("VTA_RPC_HOST", "192.168.2.99") |
| device_port = os.environ.get("VTA_RPC_PORT", "9091") |
| if not tracker_host or not tracker_port: |
| remote = rpc.connect(device_host, int(device_port)) |
| else: |
| remote = autotvm.measure.request_remote( |
| env.TARGET, tracker_host, int(tracker_port), timeout=10000 |
| ) |
| |
| # Reconfigure the JIT runtime and FPGA. |
| # You can program the FPGA with your own custom bitstream |
| # by passing the path to the bitstream file instead of None. |
| reconfig_start = time.time() |
| vta.reconfig_runtime(remote) |
| vta.program_fpga(remote, bitstream=None) |
| reconfig_time = time.time() - reconfig_start |
| print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time)) |
| |
| # In simulation mode, host the RPC server locally. |
| else: |
| remote = rpc.LocalSession() |
| |
| if env.TARGET in ["intelfocl"]: |
| # program intelfocl aocx |
| vta.program_fpga(remote, bitstream="vta.bitstream") |
| |
| # Get execution context from remote |
| ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0) |
| |
| ###################################################################### |
| # Build the inference graph executor |
| # --------------------------------- |
| # Grab vision model from Gluon model zoo and compile with Relay. |
| # The compilation steps are: |
| # |
| # 1. Front end translation from MxNet into Relay module. |
| # 2. Apply 8-bit quantization: here we skip the first conv layer, |
| # and dense layer which will both be executed in fp32 on the CPU. |
| # 3. Perform graph packing to alter the data layout for tensorization. |
| # 4. Perform constant folding to reduce number of operators (e.g. eliminate batch norm multiply). |
| # 5. Perform relay build to object file. |
| # 6. Load the object file onto remote (FPGA device). |
| # 7. Generate graph executor, `m`. |
| # |
| |
| # Load pre-configured AutoTVM schedules |
| with autotvm.tophub.context(target): |
| |
| # Populate the shape and data type dictionary for ImageNet classifier input |
| dtype_dict = {"data": "float32"} |
| shape_dict = {"data": (env.BATCH, 3, 224, 224)} |
| |
| # Get off the shelf gluon model, and convert to relay |
| gluon_model = vision.get_model(model, pretrained=True) |
| |
| # Measure build start time |
| build_start = time.time() |
| |
| # Start front end compilation |
| mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) |
| |
| # Update shape and type dictionary |
| shape_dict.update({k: v.shape for k, v in params.items()}) |
| dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) |
| |
| if target.device_name == "vta": |
| # Perform quantization in Relay |
| # Note: We set opt_level to 3 in order to fold batch norm |
| with tvm.transform.PassContext(opt_level=3): |
| with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): |
| mod = relay.quantize.quantize(mod, params=params) |
| # Perform graph packing and constant folding for VTA target |
| assert env.BLOCK_IN == env.BLOCK_OUT |
| # do device annotation if target is intelfocl or sim |
| relay_prog = graph_pack( |
| mod["main"], |
| env.BATCH, |
| env.BLOCK_OUT, |
| env.WGT_WIDTH, |
| start_name=pack_dict[model][0], |
| stop_name=pack_dict[model][1], |
| device_annot=(env.TARGET == "intelfocl" or env.TARGET == "sim"), |
| ) |
| else: |
| relay_prog = mod["main"] |
| |
| # Compile Relay program with AlterOpLayout disabled |
| if target.device_name != "vta": |
| with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): |
| graph, lib, params = relay.build( |
| relay_prog, target=target, params=params, target_host=env.target_host |
| ) |
| else: |
| if env.TARGET == "intelfocl" or env.TARGET == "sim": |
| # multiple targets to run both on cpu and vta |
| target = {"cpu": env.target_vta_cpu, "ext_dev": target} |
| with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): |
| graph, lib, params = relay.build( |
| relay_prog, target=target, params=params, target_host=env.target_host |
| ) |
| |
| # Measure Relay build time |
| build_time = time.time() - build_start |
| print(model + " inference graph built in {0:.2f}s!".format(build_time)) |
| |
| # Send the inference library over to the remote RPC server |
| temp = utils.tempdir() |
| lib.export_library(temp.relpath("graphlib.tar")) |
| remote.upload(temp.relpath("graphlib.tar")) |
| lib = remote.load_module("graphlib.tar") |
| |
| if env.TARGET == "intelfocl" or env.TARGET == "sim": |
| ctxes = [remote.ext_dev(0), remote.cpu(0)] |
| m = graph_runtime.create(graph, lib, ctxes) |
| else: |
| # Graph runtime |
| m = graph_runtime.create(graph, lib, ctx) |
| |
| ###################################################################### |
| # Perform image classification inference |
| # -------------------------------------- |
| # We run classification on an image sample from ImageNet |
| # We just need to download the categories files, `synset.txt` |
| # and an input test image. |
| |
| # Download ImageNet categories |
| categ_url = "https://github.com/uwsampl/web-data/raw/main/vta/models/" |
| categ_fn = "synset.txt" |
| download.download(join(categ_url, categ_fn), categ_fn) |
| synset = eval(open(categ_fn).read()) |
| |
| # Download test image |
| image_url = "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg" |
| image_fn = "cat.png" |
| download.download(image_url, image_fn) |
| |
| # Prepare test image for inference |
| image = Image.open(image_fn).resize((224, 224)) |
| plt.imshow(image) |
| plt.show() |
| image = np.array(image) - np.array([123.0, 117.0, 104.0]) |
| image /= np.array([58.395, 57.12, 57.375]) |
| image = image.transpose((2, 0, 1)) |
| image = image[np.newaxis, :] |
| image = np.repeat(image, env.BATCH, axis=0) |
| |
| # Set the network parameters and inputs |
| m.set_input(**params) |
| m.set_input("data", image) |
| |
| # Perform inference and gather execution statistics |
| # More on: :py:method:`tvm.runtime.Module.time_evaluator` |
| num = 4 # number of times we run module for a single measurement |
| rep = 3 # number of measurements (we derive std dev from this) |
| timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep) |
| |
| if env.TARGET in ["sim", "tsim"]: |
| simulator.clear_stats() |
| timer() |
| sim_stats = simulator.stats() |
| print("\nExecution statistics:") |
| for k, v in sim_stats.items(): |
| # Since we execute the workload many times, we need to normalize stats |
| # Note that there is always one warm up run |
| # Therefore we divide the overall stats by (num * rep + 1) |
| print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1))) |
| else: |
| tcost = timer() |
| std = np.std(tcost.results) * 1000 |
| mean = tcost.mean * 1000 |
| print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH)) |
| print("Average per sample inference time: %.2fms" % (mean / env.BATCH)) |
| |
| # Get classification results |
| tvm_output = m.get_output(0, tvm.nd.empty((env.BATCH, 1000), "float32", remote.cpu(0))) |
| for b in range(env.BATCH): |
| top_categories = np.argsort(tvm_output.asnumpy()[b]) |
| # Report top-5 classification results |
| print("\n{} prediction for sample {}".format(model, b)) |
| print("\t#1:", synset[top_categories[-1]]) |
| print("\t#2:", synset[top_categories[-2]]) |
| print("\t#3:", synset[top_categories[-3]]) |
| print("\t#4:", synset[top_categories[-4]]) |
| print("\t#5:", synset[top_categories[-5]]) |
| # This just checks that one of the 5 top categories |
| # is one variety of cat; this is by no means an accurate |
| # assessment of how quantization affects classification |
| # accuracy but is meant to catch changes to the |
| # quantization pass that would accuracy in the CI. |
| cat_detected = False |
| for k in top_categories[-5:]: |
| if "cat" in synset[k]: |
| cat_detected = True |
| assert cat_detected |