tutorials/get_started/relay_quick_start.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 """
 .. _tutorial-relay-quick-start:

 Quick Start Tutorial for Compiling Deep Learning Models
 =======================================================
 **Author**: `Yao Wang <https://github.com/kevinthesun>`_, `Truman Tian <https://github.com/SiNZeRo>`_

 This example shows how to build a neural network with Relay python frontend and
 generate a runtime library for Nvidia GPU with TVM.
 Note that you need to build TVM with cuda and llvm enabled.
 """

 ######################################################################
 # Overview for Supported Hardware Backend of TVM
 # ----------------------------------------------
 # The image below shows hardware backend currently supported by TVM:
 #
 # .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png
 #      :align: center
 #
 # In this tutorial, we'll choose cuda and llvm as target backends.
 # To begin with, let's import Relay and TVM.

 import numpy as np

 from tvm import relay
 from tvm.relay import testing
 import tvm
 from tvm import te
 from tvm.contrib import graph_runtime

 ######################################################################
 # Define Neural Network in Relay
 # ------------------------------
 # First, let's define a neural network with relay python frontend.
 # For simplicity, we'll use pre-defined resnet-18 network in Relay.
 # Parameters are initialized with Xavier initializer.
 # Relay also supports other model formats such as MXNet, CoreML, ONNX and
 # Tensorflow.
 #
 # In this tutorial, we assume we will do inference on our device
 # and the batch size is set to be 1. Input images are RGB color
 # images of size 224 * 224. We can call the :any:`tvm.relay.TupleWrapper.astext()`
 # to show the network structure.

 # Inference-only configuration: a batch of one 3-channel 224 x 224 image,
 # classified into `num_class` categories.
 batch_size = 1
 num_class = 1000
 image_shape = (3, 224, 224)
 data_shape = (batch_size, *image_shape)
 out_shape = (batch_size, num_class)

 # Fetch the pre-defined 18-layer ResNet workload (module + parameters).
 mod, params = testing.resnet.get_workload(
     batch_size=batch_size, num_layers=18, image_shape=image_shape
 )

 # set show_meta_data=True if you want to show meta data
 network_text = mod.astext(show_meta_data=False)
 print(network_text)

 ######################################################################
 # Compilation
 # -----------
 # Next step is to compile the model using the Relay/TVM pipeline.
 # Users can specify the optimization level of the compilation.
 # Currently this value can be 0 to 3. The optimization passes include
 # operator fusion, pre-computation, layout transformation and so on.
 #
 # :py:func:`relay.build` produces three components: the execution graph in
 # json format, the TVM module library of compiled functions specifically
 # for this graph on the target hardware, and the parameter blobs of
 # the model. In the code below they are held together in the single
 # returned object ``lib``. During the compilation, Relay does the graph-level
 # optimization while TVM does the tensor-level optimization, resulting
 # in an optimized runtime module for model serving.
 #
 # We'll first compile for Nvidia GPU. Behind the scenes, :py:func:`relay.build`
 # first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
 # then registers the operators (i.e. the nodes of the optimized graphs) to
 # TVM implementations to generate a `tvm.module`.
 # To generate the module library, TVM will first transfer the high level IR
 # into the lower intrinsic IR of the specified target backend, which is CUDA
 # in this example. Then the machine code will be generated as the module library.

 # Compile for CUDA; the build runs inside a PassContext that carries the
 # requested optimization level.
 target = tvm.target.cuda()
 opt_level = 3
 build_ctx = tvm.transform.PassContext(opt_level=opt_level)
 with build_ctx:
     lib = relay.build(mod, target=target, params=params)

 #####################################################################
 # Run the generated library
 # -------------------------
 # Now we can create graph runtime and run the module on Nvidia GPU.

 # select the GPU context and draw a random float32 input in [-1, 1)
 ctx = tvm.gpu()
 data = np.random.uniform(low=-1, high=1, size=data_shape).astype("float32")

 # instantiate the graph runtime from the library's "default" entry
 module = graph_runtime.GraphModule(lib["default"](ctx))

 # feed the input, then execute the graph
 module.set_input("data", data)
 module.run()

 # fetch output 0 through a freshly allocated buffer and convert to numpy
 out_buffer = tvm.nd.empty(out_shape)
 out = module.get_output(0, out_buffer).asnumpy()

 # Print first 10 elements of output
 print(out.flatten()[:10])

 ######################################################################
 # Save and Load Compiled Module
 # -----------------------------
 # We can also save the graph, lib and parameters into files and load them
 # back in deploy environment.

 ####################################################

 # save the compiled library as one archive file ("deploy_lib.tar");
 # note that the code below writes a single file, not separate
 # graph / lib / params files
 from tvm.contrib import util

 # path of the archive inside the directory object returned by util.tempdir()
 temp = util.tempdir()
 path_lib = temp.relpath("deploy_lib.tar")
 lib.export_library(path_lib)
 # list the directory contents to show what was written
 print(temp.listdir())

 ####################################################

 # load the module back.
 loaded_lib = tvm.runtime.load_module(path_lib)
 # Feed the reloaded module the SAME input that produced `out`. Comparing
 # against the output of a different, independently drawn random input would
 # not actually verify that the saved/loaded module matches the original.
 input_data = tvm.nd.array(data)

 module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
 module.run(data=input_data)
 out_deploy = module.get_output(0).asnumpy()

 # Print first 10 elements of output
 print(out_deploy.flatten()[0:10])

 # check whether the output from deployed module is consistent with original one
 tvm.testing.assert_allclose(out_deploy, out, atol=1e-3)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	"""
	.. _tutorial-relay-quick-start:

	Quick Start Tutorial for Compiling Deep Learning Models
	=======================================================
	**Author**: `Yao Wang <https://github.com/kevinthesun>`_, `Truman Tian <https://github.com/SiNZeRo>`_

	This example shows how to build a neural network with Relay python frontend and
	generate a runtime library for Nvidia GPU with TVM.
	Note that you need to build TVM with cuda and llvm enabled.
	"""

	######################################################################
	# Overview for Supported Hardware Backend of TVM
	# ----------------------------------------------
	# The image below shows hardware backend currently supported by TVM:
	#
	# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png
	#      :align: center
	#
	# In this tutorial, we'll choose cuda and llvm as target backends.
	# To begin with, let's import Relay and TVM.

	import numpy as np

	from tvm import relay
	from tvm.relay import testing
	import tvm
	from tvm import te
	from tvm.contrib import graph_runtime

	######################################################################
	# Define Neural Network in Relay
	# ------------------------------
	# First, let's define a neural network with relay python frontend.
	# For simplicity, we'll use pre-defined resnet-18 network in Relay.
	# Parameters are initialized with Xavier initializer.
	# Relay also supports other model formats such as MXNet, CoreML, ONNX and
	# Tensorflow.
	#
	# In this tutorial, we assume we will do inference on our device
	# and the batch size is set to be 1. Input images are RGB color
	# images of size 224 * 224. We can call the :any:`tvm.relay.TupleWrapper.astext()`
	# to show the network structure.

	# Inference-only configuration: a batch of one 3-channel 224 x 224 image,
	# classified into `num_class` categories.
	batch_size = 1
	num_class = 1000
	image_shape = (3, 224, 224)
	data_shape = (batch_size, *image_shape)
	out_shape = (batch_size, num_class)

	# Fetch the pre-defined 18-layer ResNet workload (module + parameters).
	mod, params = testing.resnet.get_workload(
		batch_size=batch_size, num_layers=18, image_shape=image_shape
	)

	# set show_meta_data=True if you want to show meta data
	network_text = mod.astext(show_meta_data=False)
	print(network_text)

	######################################################################
	# Compilation
	# -----------
	# Next step is to compile the model using the Relay/TVM pipeline.
	# Users can specify the optimization level of the compilation.
	# Currently this value can be 0 to 3. The optimization passes include
	# operator fusion, pre-computation, layout transformation and so on.
	#
	# :py:func:`relay.build` produces three components: the execution graph in
	# json format, the TVM module library of compiled functions specifically
	# for this graph on the target hardware, and the parameter blobs of
	# the model. In the code below they are held together in the single
	# returned object ``lib``. During the compilation, Relay does the graph-level
	# optimization while TVM does the tensor-level optimization, resulting
	# in an optimized runtime module for model serving.
	#
	# We'll first compile for Nvidia GPU. Behind the scenes, :py:func:`relay.build`
	# first does a number of graph-level optimizations, e.g. pruning, fusing, etc.,
	# then registers the operators (i.e. the nodes of the optimized graphs) to
	# TVM implementations to generate a `tvm.module`.
	# To generate the module library, TVM will first transfer the high level IR
	# into the lower intrinsic IR of the specified target backend, which is CUDA
	# in this example. Then the machine code will be generated as the module library.

	# Compile for CUDA. The build call must be nested inside the PassContext
	# block so that it runs with the requested optimization level.
	opt_level = 3
	target = tvm.target.cuda()
	with tvm.transform.PassContext(opt_level=opt_level):
		lib = relay.build(mod, target, params=params)

	#####################################################################
	# Run the generated library
	# -------------------------
	# Now we can create graph runtime and run the module on Nvidia GPU.

	# select the GPU context and draw a random float32 input in [-1, 1)
	ctx = tvm.gpu()
	data = np.random.uniform(low=-1, high=1, size=data_shape).astype("float32")

	# instantiate the graph runtime from the library's "default" entry
	module = graph_runtime.GraphModule(lib["default"](ctx))

	# feed the input, then execute the graph
	module.set_input("data", data)
	module.run()

	# fetch output 0 through a freshly allocated buffer and convert to numpy
	out_buffer = tvm.nd.empty(out_shape)
	out = module.get_output(0, out_buffer).asnumpy()

	# Print first 10 elements of output
	print(out.flatten()[:10])

	######################################################################
	# Save and Load Compiled Module
	# -----------------------------
	# We can also save the graph, lib and parameters into files and load them
	# back in deploy environment.

	####################################################

	# save the compiled library as one archive file ("deploy_lib.tar");
	# note that the code below writes a single file, not separate
	# graph / lib / params files
	from tvm.contrib import util

	# path of the archive inside the directory object returned by util.tempdir()
	temp = util.tempdir()
	path_lib = temp.relpath("deploy_lib.tar")
	lib.export_library(path_lib)
	# list the directory contents to show what was written
	print(temp.listdir())

	####################################################

	# load the module back.
	loaded_lib = tvm.runtime.load_module(path_lib)
	# Feed the reloaded module the SAME input that produced `out`. Comparing
	# against the output of a different, independently drawn random input would
	# not actually verify that the saved/loaded module matches the original.
	input_data = tvm.nd.array(data)

	module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
	module.run(data=input_data)
	out_deploy = module.get_output(0).asnumpy()

	# Print first 10 elements of output
	print(out_deploy.flatten()[0:10])

	# check whether the output from deployed module is consistent with original one
	tvm.testing.assert_allclose(out_deploy, out, atol=1e-3)