| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ruff: noqa: E402 |
| |
| """ |
| .. _optimize_model: |
| |
| End-to-End Optimize Model |
| ========================= |
| This tutorial demonstrates how to optimize a machine learning model using Apache TVM. We will |
| use a pre-trained ResNet-18 model from PyTorch and end-to-end optimize it using TVM's Relax API. |
| Please note that default end-to-end optimization may not suit complex models. |
| """ |
| |
| ###################################################################### |
| # Preparation |
| # ----------- |
| # First, we prepare the model and input information. We use a pre-trained ResNet-18 model from |
| # PyTorch. |
| |
| import os |
| |
| import numpy as np |
| import torch |
| from torch.export import export |
| from torchvision.models.resnet import ResNet18_Weights, resnet18 |
| |
# Instantiate the pre-trained ResNet-18 (default torchvision weights) and put it in
# evaluation mode before it is exported.
torch_model = resnet18(weights=ResNet18_Weights.DEFAULT)
torch_model.eval()
| |
| ###################################################################### |
| # Review Overall Flow |
| # ------------------- |
| # .. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg |
| #    :align: center |
| #    :width: 80% |
| # |
| # The overall flow consists of the following steps: |
| # |
| # - **Construct or Import a Model**: Construct a neural network model or import a pre-trained |
| #   model from other frameworks (e.g. PyTorch, ONNX), and create the TVM IRModule, which contains |
| #   all the information needed for compilation, including high-level Relax functions for |
| #   the computational graph and low-level TensorIR functions for tensor programs. |
| # - **Perform Composable Optimizations**: Perform a series of optimization transformations, |
| #   such as graph optimizations, tensor program optimizations, and library dispatching. |
| # - **Build and Universal Deployment**: Build the optimized model into a deployable module for the |
| #   universal runtime, and execute it on different devices, such as CPU, GPU, or other accelerators. |
| # |
| |
| |
| ###################################################################### |
| # Convert the model to IRModule |
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| # Next, we convert the model to an IRModule using the Relax frontend for PyTorch, so that it |
| # can be optimized further. |
| |
| import tvm |
| from tvm import relax |
| from tvm.relax.frontend.torch import from_exported_program |
| |
# torch.export needs a concrete example input: one float32 tensor of shape (1, 3, 224, 224).
example_args = (torch.randn((1, 3, 224, 224), dtype=torch.float32),)

# The guarded steps in this script are skipped when the CI environment variable is "true".
IS_IN_CI = os.environ.get("CI", "") == "true"
| |
if not IS_IN_CI:
    # Export the PyTorch model and import it into Relax with gradient tracking disabled.
    # ``keep_params_as_input=True`` asks the frontend to keep the weights as function inputs.
    with torch.no_grad():
        exported_program = export(torch_model, example_args)
        mod = from_exported_program(exported_program, keep_params_as_input=True)

    # Separate the parameters from the module; ``params`` is indexed by function name later on.
    mod, params = relax.frontend.detach_params(mod)

    # Print the imported IRModule.
    mod.show()
| |
| ###################################################################### |
| # IRModule Optimization |
| # --------------------- |
| # Apache TVM provides a flexible way to optimize the IRModule. Everything centered |
| # around IRModule optimization can be composed with existing pipelines. Note that each |
| # transformation can be combined as an optimization pipeline via ``tvm.ir.transform.Sequential``. |
| # |
| # In this tutorial, we focus on the end-to-end optimization of the model via auto-tuning. We |
| # leverage MetaSchedule to tune the model and store the tuning logs to the database. We also |
| # apply the database to the model to get the best performance. |
| # |
| # The ResNet18 model will be divided into 20 independent tuning tasks during compilation. |
| # To ensure each task receives adequate tuning resources in one iteration while providing |
| # early feedback: |
| # |
| # - To quickly observe tuning progress, each task is allocated a maximum of 16 trials per |
| #   iteration (controlled by ``MAX_TRIALS_PER_TASK=16``). Setting ``TOTAL_TRIALS`` to at |
| #   least 320 (20 tasks * 16 trials) ensures that every task receives one full iteration |
| #   of tuning. We set it to 512 in our configuration to allow for several more iterations, |
| #   aiming to explore a wider parameter space and potentially achieve better performance. |
| # - If ``MAX_TRIALS_PER_TASK`` is ``None``, the system defaults to ``TOTAL_TRIALS`` trials per |
| #   task per iteration. An insufficient ``TOTAL_TRIALS`` setting may lead to undersubscribed |
| #   tuning, potentially skipping some tasks entirely. Explicitly setting both parameters |
| #   avoids this issue and provides deterministic resource allocation across all tasks. |
| # |
| # Note: These parameter settings are optimized for quick tutorial demonstration. For production |
| # deployments requiring higher performance, we recommend adjusting both ``MAX_TRIALS_PER_TASK`` |
| # and ``TOTAL_TRIALS`` to larger values. This allows more extensive search space exploration |
| # and typically yields better performance outcomes. |
| |
# Tuning budget: a per-task cap per iteration and a global cap across all tasks.
MAX_TRIALS_PER_TASK = 16  # Change to more trials per task for better performance if needed
TOTAL_TRIALS = 512  # Change to 20000 for better performance if needed

# Directory passed to the tuning pipeline as its working directory.
work_dir = "tuning_logs"

# Compilation target used for tuning and for the final build.
target = tvm.target.Target("nvidia/geforce-rtx-3090-ti")  # Change to your target device
| |
if not IS_IN_CI:
    # Build the "static_shape_tuning" pipeline with the budget configured above,
    # then run it over the imported module.
    tuning_pipeline = relax.get_pipeline(
        "static_shape_tuning",
        target=target,
        work_dir=work_dir,
        total_trials=TOTAL_TRIALS,
        max_trials_per_task=MAX_TRIALS_PER_TASK,
    )
    mod = tuning_pipeline(mod)

    # Printing the whole module would be long, so only show the main function.
    mod["main"].show()
| |
| ###################################################################### |
| # Build and Deploy |
| # ---------------- |
| # Finally, we build the optimized model and deploy it to the target device. |
| # We skip this step in the CI environment. |
| |
if not IS_IN_CI:
    # Apply the default GPU schedule with the target in scope, then compile the module.
    with target:
        mod = tvm.s_tir.transform.DefaultGPUSchedule()(mod)
    ex = tvm.compile(mod, target=target)

    # Load the compiled executable into a Relax virtual machine on CUDA device 0.
    dev = tvm.device("cuda", 0)
    vm = relax.VirtualMachine(ex, dev)

    # Both the input data and the detached parameters have to be allocated on the GPU device.
    input_array = np.random.rand(1, 3, 224, 224).astype("float32")
    gpu_data = tvm.runtime.tensor(input_array, dev)
    gpu_params = [tvm.runtime.tensor(p, dev) for p in params["main"]]

    # Call "main", take its first output and convert it to a NumPy array.
    gpu_out = vm["main"](gpu_data, *gpu_params)[0].numpy()

    print(gpu_out.shape)