| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ruff: noqa: E402, E501, F401 |
| |
| """ |
| .. _customize_opt: |
| |
| Customize Optimization |
| ====================== |
One main design goal of Apache TVM is to enable easy customization of the optimization pipeline
for both research and development purposes, and to make it easy to iterate on engineering
optimizations. In this tutorial we will cover the following topics:
| |
| .. contents:: Table of Contents |
| :local: |
| :depth: 1 |
| """ |
| |
| ###################################################################### |
| # Review Overall Flow |
| # ------------------- |
| # .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg |
| # :align: center |
| # :width: 80% |
| # |
| # The overall flow consists of the following steps: |
| # |
# - **Construct or Import a Model**: Construct a neural network model or import a pre-trained
#   model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains
#   all the information needed for compilation, including high-level Relax functions for the
#   computational graph and low-level TensorIR functions for the tensor programs.
# - **Perform Composable Optimizations**: Perform a series of optimization transformations,
#   such as graph optimizations, tensor program optimizations, and library dispatching.
# - **Build and Universal Deployment**: Build the optimized model into a deployable module for the
#   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators.
| # |
| |
| import os |
| import tempfile |
| |
| import numpy as np |
| |
| import tvm |
| from tvm import IRModule, relax |
| from tvm.relax.frontend import nn |
| |
| ###################################################################### |
| # Composable IRModule Optimization |
| # -------------------------------- |
# Apache TVM provides a flexible way to optimize the IRModule. Since every optimization is a
# transformation on the IRModule, it can be composed with existing pipelines. Note that each
# optimization can focus on **part of the computation graph**, enabling partial lowering or
# partial optimization.
#
# In this tutorial, we will demonstrate how to optimize a model with Apache TVM.
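#
# As a minimal sketch of this composability (the two passes below are generic cleanup
# passes chosen purely for illustration), any number of passes can be chained with
# ``tvm.ir.transform.Sequential`` and applied as a single pass:

sketch_pipeline = tvm.ir.transform.Sequential(
    [
        relax.transform.FoldConstant(),
        relax.transform.DeadCodeElimination(),
    ]
)
# The composed pipeline is itself a pass: calling ``sketch_pipeline(mod)`` on an
# IRModule returns the transformed IRModule.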
| |
| ###################################################################### |
| # Prepare a Relax Module |
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~ |
# We first prepare a Relax module. The module can be imported from other frameworks, constructed
# with the NN module frontend, or written directly in TVMScript. Here we use a simple neural
# network model as an example.
| |
| |
| class RelaxModel(nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.fc1 = nn.Linear(784, 256) |
| self.relu1 = nn.ReLU() |
| self.fc2 = nn.Linear(256, 10, bias=False) |
| |
| def forward(self, x): |
| x = self.fc1(x) |
| x = self.relu1(x) |
| x = self.fc2(x) |
| return x |
| |
| |
| input_shape = (1, 784) |
| mod, params = RelaxModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}}) |
| mod.show() |
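
######################################################################
# For reference, an equivalent module could also be written directly in TVMScript.
# The sketch below is illustrative rather than the exact output of ``export_tvm``:
# the parameter names, and the use of ``R.permute_dims`` to express the transposed
# weights of ``nn.Linear``, are our own choices.

from tvm.script import ir as I
from tvm.script import relax as R


@I.ir_module
class TVMScriptModule:
    @R.function
    def forward(
        x: R.Tensor((1, 784), "float32"),
        fc1_weight: R.Tensor((256, 784), "float32"),
        fc1_bias: R.Tensor((256,), "float32"),
        fc2_weight: R.Tensor((10, 256), "float32"),
    ) -> R.Tensor((1, 10), "float32"):
        with R.dataflow():
            # x @ fc1_weight^T + fc1_bias, followed by ReLU
            w1_t = R.permute_dims(fc1_weight)
            lv0 = R.matmul(x, w1_t)
            lv1 = R.add(lv0, fc1_bias)
            lv2 = R.nn.relu(lv1)
            # x @ fc2_weight^T (no bias)
            w2_t = R.permute_dims(fc2_weight)
            out = R.matmul(lv2, w2_t)
            R.output(out)
        return out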
| |
| ###################################################################### |
| # Library Dispatch |
| # ~~~~~~~~~~~~~~~~ |
# We often want to quickly try out a library-based optimization on a certain platform
# (e.g., GPU). We can write a dedicated dispatching pass for the specific platform and
# operator. Here we demonstrate how to dispatch certain patterns to the cuBLAS library.
#
# .. note::
#   This tutorial only demonstrates dispatching a single operator pattern to cuBLAS,
#   highlighting the flexibility of the optimization pipeline. In real-world cases, we can
#   import multiple patterns and dispatch them to different kernels.
| |
| |
| # Import cublas pattern |
| try: |
| import tvm.relax.backend.cuda.cublas as _cublas |
| except ImportError as e: |
| raise ImportError( |
| "This tutorial requires TVM built with CUDA support.\n" |
| "If you hit missing 'tvm_ffi', try: pip install apache-tvm-ffi\n" |
| "Otherwise build TVM with CUDA enabled:\n" |
| " https://tvm.apache.org/docs/install/from_source.html\n" |
| f"Original error: {e}" |
| ) from e |
| |
| |
| # Define a new pass for CUBLAS dispatch |
| @tvm.transform.module_pass(opt_level=0, name="CublasDispatch") |
| class CublasDispatch: |
| def transform_module(self, mod: IRModule, _ctx: tvm.transform.PassContext) -> IRModule: |
        # Check whether the cuBLAS codegen is enabled in this TVM build
        if not tvm.get_global_func("relax.ext.cublas", True):
            raise RuntimeError("CUBLAS is not enabled.")
| |
        # Select the patterns of interest
        patterns = [relax.backend.get_pattern("cublas.matmul_transposed_bias_relu")]
        # Note: in real-world cases, we usually fetch all patterns with a common prefix:
        # patterns = relax.backend.get_patterns_with_prefix("cublas")
| |
| # Fuse ops by patterns and then run codegen |
| mod = relax.transform.FuseOpsByPattern(patterns, annotate_codegen=True)(mod) |
| mod = relax.transform.RunCodegen()(mod) |
| return mod |
| |
| |
| mod = CublasDispatch()(mod) |
| mod.show() |
| |
| ###################################################################### |
# After the dispatching pass, we can see that the first ``nn.Linear`` and ``nn.ReLU`` are fused
# and rewritten into a ``call_dps_packed`` function which calls the cuBLAS library. Notably, the
# rest of the module is left unchanged, which means we can selectively dispatch optimizations
# for certain parts of the computation.
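
######################################################################
# As a quick optional check, we can list the functions now present in the module.
# This sketch assumes the standard ``IRModule.functions`` mapping API:

for gvar, func in mod.functions.items():
    print(f"{gvar.name_hint}: {type(func).__name__}")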
| |
| ###################################################################### |
| # Auto Tuning |
| # ~~~~~~~~~~~ |
# Continuing from the previous example, we can further optimize the model with auto-tuning for
# the **remaining part of the computation**. Here we demonstrate how to use MetaSchedule to
# auto-tune the model.
#
# We can use the ``MetaScheduleTuneTIR`` pass to tune the model, and the
# ``MetaScheduleApplyDatabase`` pass to apply the best configurations found during tuning.
# The tuning step generates a search space and tunes the model; the apply step then rewrites
# the model with the best configurations from the database. Before running these passes, we
# need to lower the Relax operators into TensorIR functions via ``LegalizeOps``, which is
# part of the ``zero`` pipeline applied below.
| # |
| # .. note:: |
| # |
| # To save CI time and avoid flakiness, we skip the tuning process in CI environment. |
| # |
| |
| device = tvm.cuda(0) |
| target = tvm.target.Target.from_device(device) |
| if os.getenv("CI", "") != "true": |
| trials = 2000 |
| with target, tempfile.TemporaryDirectory() as tmp_dir: |
| mod = tvm.ir.transform.Sequential( |
| [ |
| relax.get_pipeline("zero"), |
| relax.transform.MetaScheduleTuneTIR(work_dir=tmp_dir, max_trials_global=trials), |
| relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir), |
| ] |
| )(mod) |
| |
| mod.show() |
| |
| ###################################################################### |
| # DLight Rules |
| # ~~~~~~~~~~~~ |
# DLight rules are a set of default rules for scheduling and optimizing kernels.
# They are designed for fast compilation and **fair** performance. In some cases,
# e.g. language models, DLight provides excellent performance, while for generic models,
# it achieves a balance between performance and compilation time.
| |
| from tvm.s_tir import dlight as dl |
| |
| # Apply DLight rules |
| with target: |
| mod = tvm.ir.transform.Sequential( |
| [ |
| relax.get_pipeline("zero"), |
| dl.ApplyDefaultSchedule( # pylint: disable=not-callable |
| dl.gpu.Matmul(), |
| dl.gpu.GEMV(), |
| dl.gpu.Reduction(), |
| dl.gpu.GeneralReduction(), |
| dl.gpu.Fallback(), |
| ), |
| ] |
| )(mod) |
| |
| mod.show() |
| |
| ###################################################################### |
| # .. note:: |
| # |
| # This tutorial focuses on the demonstration of the optimization pipeline, instead of |
| # pushing the performance to the limit. The current optimization may not be the best. |
| |
| |
| ###################################################################### |
| # Deploy the Optimized Model |
| # -------------------------- |
| # We can build and deploy the optimized model to the TVM runtime. |
| |
# Reuse the CUDA target derived from the device above
ex = tvm.compile(mod, target=target)
| dev = tvm.device("cuda", 0) |
| vm = relax.VirtualMachine(ex, dev) |
| # Need to allocate data and params on GPU device |
| data = tvm.runtime.tensor(np.random.rand(*input_shape).astype("float32"), dev) |
| gpu_params = [tvm.runtime.tensor(np.random.rand(*p.shape).astype(p.dtype), dev) for _, p in params] |
| gpu_out = vm["forward"](data, *gpu_params).numpy() |
| print(gpu_out) |
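
######################################################################
# Optionally, we can benchmark the compiled function. This assumes the Relax
# virtual machine's ``time_evaluator`` helper is available in your TVM build:

timer = vm.time_evaluator("forward", dev, number=10)
print(timer(data, *gpu_params))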
| |
| |
| ###################################################################### |
| # Summary |
| # ------- |
| # This tutorial demonstrates how to customize the optimization pipeline for ML models in Apache TVM. |
| # We can easily compose the optimization passes and customize the optimization for different parts |
| # of the computation graph. The flexibility of the optimization pipeline enables us to quickly |
| # iterate the optimization and improve the performance of the model. |
| # |