docs/get_started/tutorials/quick_start.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 .. _quick_start:

 Quick Start
 ===========

 This tutorial is for people who are new to Apache TVM. It uses a simple example
 to show how to use Apache TVM to compile a simple neural network.

 .. contents:: Table of Contents
     :local:
     :depth: 2

 """

 ################################################################################
 # Overview
 # --------
 # Apache TVM is a machine learning compilation framework, following the principle of
 # **Python-first development** and **universal deployment**. It takes in pre-trained
 # machine learning models, compiles and generates deployable modules that can be embedded
 # and run everywhere.
 # Apache TVM also enables customizing optimization processes to introduce new optimizations,
 # libraries, codegen and more.
 #
 # Apache TVM can help to:
 #
 # - **Optimize** performance of ML workloads, composing libraries and codegen.
 # - **Deploy** ML workloads to a diverse set of new environments, including new runtime and new
 #   hardware.
 # - **Continuously improve and customize** ML deployment pipeline in Python by quickly customizing
 #   library dispatching, bringing in customized operators and code generation.

 ################################################################################
 # Overall Flow
 # ------------
 # Next, we show the overall flow of using Apache TVM to compile a neural network model,
 # including how to optimize, deploy and run the model.
 # The overall flow is illustrated in the figure below:
 #
 # .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
 #    :align: center
 #    :width: 80%
 #
 # The overall flow consists of the following steps:
 #
 # - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
 #   model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains
 #   all the information needed for compilation, including high-level Relax functions for
 #   computational graph, and low-level TensorIR functions for tensor program.
 # - **Perform Composable Optimizations**: Perform a series of optimization transformations,
 #   such as graph optimizations, tensor program optimizations, and library dispatching.
 # - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
 #   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.

 ################################################################################
 # Construct or Import a Model
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Before we get started, let's construct a neural network model first.
 # In this tutorial, to make things simple, we will define a two-layer MLP network
 # directly in this script with the TVM Relax frontend, which is a similar API to PyTorch.
 #

 import tvm
 from tvm import relax
 from tvm.relax.frontend import nn


 class MLPModel(nn.Module):
     """Two-layer MLP: Linear(784, 256), then ReLU, then Linear(256, 10)."""

     def __init__(self):
         super().__init__()
         # Sub-modules are assigned in the same order that ``forward`` applies them.
         self.fc1 = nn.Linear(784, 256)
         self.relu1 = nn.ReLU()
         self.fc2 = nn.Linear(256, 10)

     def forward(self, x):
         """Pass ``x`` through ``fc1``, ``relu1`` and ``fc2`` in turn and return the result."""
         hidden = self.relu1(self.fc1(x))
         return self.fc2(hidden)


 ################################################################################
 # Then we can export the model to TVM IRModule, which is the central intermediate representation
 # in TVM.

 # Spec for ``forward``: its input ``x`` is a float32 tensor of shape (1, 784).
 forward_spec = {"x": nn.spec.Tensor((1, 784), "float32")}
 mod, param_spec = MLPModel().export_tvm(spec={"forward": forward_spec})
 # Display the exported module.
 mod.show()

 ################################################################################
 # Perform Optimization Transformations
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Apache TVM leverages ``pipeline`` to transform and optimize programs.
 # The pipeline encapsulates a collection of transformations that achieves two goals (at the same level):
 #
 # - **Model optimizations**: such as operator fusion, layout rewrites.
 # - **Tensor program optimization**: Map the operators to low-level implementations
 #   (either library or codegen)
 #
 # .. note::
 #   These two are goals, not stages of the pipeline. The two optimizations can be performed
 #   **at the same level**, or separately in two stages.
 #
 # .. note::
 #   In this tutorial we only demonstrate the overall flow, by leveraging the ``zero`` optimization
 #   pipeline, instead of optimizing for any specific target.

 # Look up the pipeline named "zero", then run it over the module.
 zero_pipeline = relax.get_pipeline("zero")
 mod = zero_pipeline(mod)


 ################################################################################
 # Build and Universal Deployment
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # After the optimization, we can build the model to a deployable module and run it on
 # different devices.


 import numpy as np

 device = tvm.cpu()
 target = tvm.target.Target("llvm")

 # Compile the module for the target, then wrap the result in a virtual machine on the device.
 ex = tvm.compile(mod, target)
 vm = relax.VirtualMachine(ex, device)

 # Random float32 input of shape (1, 784), copied into a TVM tensor on the device.
 data = np.random.rand(1, 784).astype("float32")
 tvm_data = tvm.runtime.tensor(data, device=device)

 # One random float32 tensor per entry of ``param_spec``, shaped like that parameter.
 params = [
     tvm.runtime.tensor(np.random.rand(*param.shape).astype("float32"), device=device)
     for _, param in param_spec
 ]

 output = vm["forward"](tvm_data, *params)
 print(output.numpy())

 ################################################################################
 # Our goal is to bring machine learning to the application with any language of interest,
 # with the minimum runtime support.
 #
 # - Each function in IRModule becomes a runnable function in the runtime. For example in LLM
 #   cases, we can call ``prefill`` and ``decode`` functions directly.
 #
 #   .. code-block:: Python
 #
 #       prefill_logits = vm["prefill"](inputs, weight, kv_cache)
 #       decoded_logits = vm["decode"](inputs, weight, kv_cache)
 #
 # - TVM runtime comes with native data structures, such as Tensor, which can also have zero-copy
 #   exchange with the existing ecosystem (e.g. DLPack exchange with PyTorch)
 #
 #   .. code-block:: Python
 #
 #       # Convert PyTorch tensor to TVM Tensor
 #       x_tvm = tvm.runtime.from_dlpack(x_torch)
 #       # Convert TVM Tensor to PyTorch tensor
 #       x_torch = torch.from_dlpack(x_tvm)
 #
 # - TVM runtime works in non-Python environments, so it works in settings such as mobile
 #
 #   .. code-block:: C++
 #
 #       // C++ snippet
 #       runtime::Module vm = ex.GetFunction("load_executable")();
 #       vm.GetFunction("init")(...);
 #       Tensor out = vm.GetFunction("prefill")(data, weight, kv_cache);
 #
 #   .. code-block:: Java
 #
 #       // Java snippet
 #       Module vm = ex.getFunction("load_executable").invoke();
 #       vm.getFunction("init").pushArg(...).invoke();
 #       Tensor out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke();
 #

 ################################################################################
 # Read next
 # ---------
 # This tutorial demonstrates the overall flow of using Apache TVM to compile a neural network model.
 # For more advanced or specific topics, please refer to the following tutorials
 #
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	.. _quick_start:

	Quick Start
	===========

	This tutorial is for people who are new to Apache TVM. It uses a simple example
	to show how to use Apache TVM to compile a simple neural network.

	.. contents:: Table of Contents
	    :local:
	    :depth: 2

	"""

	################################################################################
	# Overview
	# --------
	# Apache TVM is a machine learning compilation framework, following the principle of
	# Python-first development and universal deployment. It takes in pre-trained
	# machine learning models, compiles and generates deployable modules that can be embedded
	# and run everywhere.
	# Apache TVM also enables customizing optimization processes to introduce new optimizations,
	# libraries, codegen and more.
	#
	# Apache TVM can help to:
	#
	# - Optimize performance of ML workloads, composing libraries and codegen.
	# - Deploy ML workloads to a diverse set of new environments, including new runtime and new
	#   hardware.
	# - Continuously improve and customize ML deployment pipeline in Python by quickly customizing
	#   library dispatching, bringing in customized operators and code generation.

	################################################################################
	# Overall Flow
	# ------------
	# Next, we show the overall flow of using Apache TVM to compile a neural network model,
	# including how to optimize, deploy and run the model.
	# The overall flow is illustrated in the figure below:
	#
	# .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg
	#    :align: center
	#    :width: 80%
	#
	# The overall flow consists of the following steps:
	#
	# - Construct or Import a Model: Construct a neural network model or import a pre-trained
	#   model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains
	#   all the information needed for compilation, including high-level Relax functions for
	#   computational graph, and low-level TensorIR functions for tensor program.
	# - Perform Composable Optimizations: Perform a series of optimization transformations,
	#   such as graph optimizations, tensor program optimizations, and library dispatching.
	# - Build and Universal Deployment: Build the optimized model into a deployable module for the
	#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.

	################################################################################
	# Construct or Import a Model
	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Before we get started, let's construct a neural network model first.
	# In this tutorial, to make things simple, we will define a two-layer MLP network
	# directly in this script with the TVM Relax frontend, which is a similar API to PyTorch.
	#

	import tvm
	from tvm import relax
	from tvm.relax.frontend import nn


	class MLPModel(nn.Module):
	    """Two-layer MLP: Linear(784, 256), then ReLU, then Linear(256, 10)."""

	    def __init__(self):
	        super().__init__()
	        # Sub-modules are assigned in the same order that ``forward`` applies them.
	        self.fc1 = nn.Linear(784, 256)
	        self.relu1 = nn.ReLU()
	        self.fc2 = nn.Linear(256, 10)

	    def forward(self, x):
	        """Pass ``x`` through ``fc1``, ``relu1`` and ``fc2`` in turn and return the result."""
	        x = self.fc1(x)
	        x = self.relu1(x)
	        x = self.fc2(x)
	        return x


	################################################################################
	# Then we can export the model to TVM IRModule, which is the central intermediate representation
	# in TVM.

	# Spec for ``forward``: its input ``x`` is a float32 tensor of shape (1, 784).
	forward_spec = {"x": nn.spec.Tensor((1, 784), "float32")}
	mod, param_spec = MLPModel().export_tvm(spec={"forward": forward_spec})
	# Display the exported module.
	mod.show()

	################################################################################
	# Perform Optimization Transformations
	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# Apache TVM leverages ``pipeline`` to transform and optimize programs.
	# The pipeline encapsulates a collection of transformations that achieves two goals (at the same level):
	#
	# - Model optimizations: such as operator fusion, layout rewrites.
	# - Tensor program optimization: Map the operators to low-level implementations
	#   (either library or codegen)
	#
	# .. note::
	#   These two are goals, not stages of the pipeline. The two optimizations can be performed
	#   at the same level, or separately in two stages.
	#
	# .. note::
	#   In this tutorial we only demonstrate the overall flow, by leveraging the ``zero`` optimization
	#   pipeline, instead of optimizing for any specific target.

	# Look up the pipeline named "zero", then run it over the module.
	zero_pipeline = relax.get_pipeline("zero")
	mod = zero_pipeline(mod)


	################################################################################
	# Build and Universal Deployment
	# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	# After the optimization, we can build the model to a deployable module and run it on
	# different devices.


	import numpy as np

	device = tvm.cpu()
	target = tvm.target.Target("llvm")

	# Compile the module for the target, then wrap the result in a virtual machine on the device.
	ex = tvm.compile(mod, target)
	vm = relax.VirtualMachine(ex, device)

	# Random float32 input of shape (1, 784), copied into a TVM tensor on the device.
	data = np.random.rand(1, 784).astype("float32")
	tvm_data = tvm.runtime.tensor(data, device=device)

	# One random float32 tensor per entry of ``param_spec``, shaped like that parameter.
	params = [
	    tvm.runtime.tensor(np.random.rand(*param.shape).astype("float32"), device=device)
	    for _, param in param_spec
	]

	output = vm["forward"](tvm_data, *params)
	print(output.numpy())

	################################################################################
	# Our goal is to bring machine learning to the application with any language of interest,
	# with the minimum runtime support.
	#
	# - Each function in IRModule becomes a runnable function in the runtime. For example in LLM
	#   cases, we can call ``prefill`` and ``decode`` functions directly.
	#
	#   .. code-block:: Python
	#
	#       prefill_logits = vm["prefill"](inputs, weight, kv_cache)
	#       decoded_logits = vm["decode"](inputs, weight, kv_cache)
	#
	# - TVM runtime comes with native data structures, such as Tensor, which can also have zero-copy
	#   exchange with the existing ecosystem (e.g. DLPack exchange with PyTorch)
	#
	#   .. code-block:: Python
	#
	#       # Convert PyTorch tensor to TVM Tensor
	#       x_tvm = tvm.runtime.from_dlpack(x_torch)
	#       # Convert TVM Tensor to PyTorch tensor
	#       x_torch = torch.from_dlpack(x_tvm)
	#
	# - TVM runtime works in non-Python environments, so it works in settings such as mobile
	#
	#   .. code-block:: C++
	#
	#       // C++ snippet
	#       runtime::Module vm = ex.GetFunction("load_executable")();
	#       vm.GetFunction("init")(...);
	#       Tensor out = vm.GetFunction("prefill")(data, weight, kv_cache);
	#
	#   .. code-block:: Java
	#
	#       // Java snippet
	#       Module vm = ex.getFunction("load_executable").invoke();
	#       vm.getFunction("init").pushArg(...).invoke();
	#       Tensor out = vm.getFunction("prefill").pushArg(data).pushArg(weight).pushArg(kv_cache).invoke();
	#

	################################################################################
	# Read next
	# ---------
	# This tutorial demonstrates the overall flow of using Apache TVM to compile a neural network model.
	# For more advanced or specific topics, please refer to the following tutorials
	#