blob: 7378b3c71c17585b1684ab9020f00adb459f1944 [file]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# ruff: noqa: E402
"""
.. _deploy_export_and_load_executable:
Export and Load Relax Executables
=================================
This tutorial walks through exporting a compiled Relax module to a shared
object, loading it back into the TVM runtime, and running the result either
interactively or from a standalone script. It demonstrates how to turn Relax
(or imported PyTorch / ONNX) programs into deployable artifacts using
``tvm.relax`` APIs.
.. note::
This tutorial uses PyTorch as the source format, but the export/load workflow
is the same for ONNX models. For ONNX, use ``from_onnx(model, keep_params_in_input=True)``
instead of ``from_exported_program()``, then follow the same steps for building,
exporting, and loading.
"""
######################################################################
# Introduction
# ------------
# TVM builds Relax programs into ``tvm.runtime.Executable`` objects. These
# contain VM bytecode, compiled kernels, and constants. By exporting the
# executable with :py:meth:`export_library`, you obtain a shared library (for
# example ``.so`` on Linux) that can be shipped to another machine, uploaded
# via RPC, or loaded back later with the TVM runtime. This tutorial shows the
# exact steps end-to-end and explains what files are produced along the way.
import os
from pathlib import Path
try:
import torch
from torch.export import export
except ImportError: # pragma: no cover
torch = None # type: ignore
######################################################################
# Prepare a Torch MLP and Convert to Relax
# ----------------------------------------
# We start with a small PyTorch MLP so the example remains lightweight. The
# model is exported to a :py:class:`torch.export.ExportedProgram` and then
# translated into a Relax ``IRModule``.
import tvm
from tvm import relax
from tvm.relax.frontend.torch import from_exported_program
# Decide whether the runnable parts of this tutorial should execute: PyTorch
# must have been imported successfully, and the ``CI`` environment variable
# must not be set to "true" (compared case-insensitively).
HAS_TORCH = torch is not None
IS_IN_CI = os.environ.get("CI", "").lower() == "true"
RUN_EXAMPLE = HAS_TORCH and not IS_IN_CI
if HAS_TORCH:

    class TorchMLP(torch.nn.Module):
        """Small MLP: Flatten -> Linear(784, 128) -> ReLU -> Linear(128, 10)."""

        def __init__(self) -> None:
            super().__init__()
            # Layers are created in the order they are applied and stored
            # under the single ``net`` attribute.
            layers = [
                torch.nn.Flatten(),
                torch.nn.Linear(28 * 28, 128),
                torch.nn.ReLU(),
                torch.nn.Linear(128, 10),
            ]
            self.net = torch.nn.Sequential(*layers)

        def forward(self, data: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
            """Run ``data`` through the layer stack held in ``self.net``."""
            output = self.net(data)
            return output

else:  # pragma: no cover
    # Keep the name defined when PyTorch is unavailable.
    TorchMLP = None  # type: ignore[misc, assignment]
if RUN_EXAMPLE:
    # Export the model in eval mode using one random 1x1x28x28 float32 sample.
    torch_model = TorchMLP().eval()
    sample_input = torch.randn(1, 1, 28, 28, dtype=torch.float32)
    example_args = (sample_input,)
    with torch.no_grad():
        exported_program = export(torch_model, example_args)

    # Import with the weights kept as function inputs, then split them off the
    # module so they can be bound later (or stored on disk).
    imported_mod = from_exported_program(exported_program, keep_params_as_input=True)
    mod, params = relax.frontend.detach_params(imported_mod)

    print("Imported Relax module:")
    mod.show()
######################################################################
# Build and Export with ``export_library``
# -------------------------------------------
# We build for ``llvm`` to generate CPU code and then export the resulting
# executable. Passing ``workspace_dir`` keeps the intermediate packaging files,
# which is useful to inspect what was produced.
# Compilation target and the directory that receives every exported artifact.
TARGET = tvm.target.Target("llvm")
ARTIFACT_DIR = Path("relax_export_artifacts")
ARTIFACT_DIR.mkdir(exist_ok=True)

if RUN_EXAMPLE:
    # Run the default Relax compilation pipeline with TARGET as the active
    # target context before handing the module to the compiler.
    pipeline = relax.get_pipeline()
    with TARGET:
        built_mod = pipeline(mod)

    # No parameters are bound at build time; they are supplied when the
    # compiled function is called.
    executable = tvm.compile(built_mod, target=TARGET)

    # Export the shared library, using ARTIFACT_DIR as the packaging workspace
    # so the intermediate files stay available for inspection.
    library_path = ARTIFACT_DIR / "mlp_cpu.so"
    executable.export_library(str(library_path), workspace_dir=str(ARTIFACT_DIR))
    print(f"Exported runtime library to: {library_path}")

    # Report everything that now lives in the workspace directory.
    produced_files = sorted(entry.name for entry in ARTIFACT_DIR.iterdir())
    print("Artifacts saved:")
    for name in produced_files:
        print(f" - {name}")
# Generated files:
# - ``mlp_cpu.so``: The main deployable shared library containing VM bytecode,
#   compiled kernels, and constants. Note: Since parameters are passed at runtime,
#   you will also need to save a separate parameters file (see the
#   "Save Parameters for Deployment" section below).
# - Intermediate object files (``devc.o``, ``lib0.o``, etc.) are kept in the
# workspace for inspection but are not required for deployment.
#
# Note: Additional files like ``*.params``, ``*.metadata.json``, or ``*.imports``
# may appear in specific configurations but are typically embedded into the
# shared library or only generated when needed.
######################################################################
# Load the Exported Library and Run It
# ------------------------------------
# Once the shared object is produced, we can reload it back into the TVM runtime
# on any machine with a compatible instruction set. The Relax VM consumes the
# runtime module directly.
if RUN_EXAMPLE:
    # Reload the exported shared library and wrap it in a Relax VM on CPU 0.
    loaded_rt_mod = tvm.runtime.load_module(str(library_path))
    dev = tvm.cpu(0)
    vm = relax.VirtualMachine(loaded_rt_mod, dev)

    # Fresh random input, converted to a TVM tensor on the VM's device.
    input_tensor = torch.randn(1, 1, 28, 28, dtype=torch.float32)
    vm_input = tvm.runtime.tensor(input_tensor.numpy(), dev)

    # The detached parameters of "main" must live on the same device.
    vm_params = [tvm.runtime.tensor(param, dev) for param in params["main"]]

    # Calling convention: the input data first, then every parameter in order.
    main_fn = vm["main"]
    tvm_output = main_fn(vm_input, *vm_params)

    # Tuple outputs come back as a TVM Array and are read by index. Models
    # imported from PyTorch typically return a tuple even for one output,
    # while ONNX models may return a single Tensor directly.
    is_nonempty_array = isinstance(tvm_output, tvm.ir.Array) and len(tvm_output) > 0
    result_tensor = tvm_output[0] if is_nonempty_array else tvm_output

    print("VM output shape:", result_tensor.shape)
    print("VM output type:", type(tvm_output), "->", type(result_tensor))

    # The reloaded module still exposes the executable's statistics.
    print("Executable stats:\n", loaded_rt_mod["stats"]())
######################################################################
# Save Parameters for Deployment
# -------------------------------
# Since parameters are passed at runtime (not embedded in the ``.so``), we must
# save them separately for deployment. This is a required step to use the model
# on other machines or in standalone scripts.
import numpy as np
if RUN_EXAMPLE:
    # Write the parameters of "main" to a single .npz archive, keyed by their
    # position (p_0, p_1, ...) so they can be reloaded in the same order.
    params_path = ARTIFACT_DIR / "model_params.npz"
    main_params = params["main"]
    param_arrays = {f"p_{idx}": tensor.numpy() for idx, tensor in enumerate(main_params)}
    np.savez(str(params_path), **param_arrays)
    print(f"Saved parameters to: {params_path}")
######################################################################
# Note: Alternatively, you can embed parameters directly into the ``.so`` to
# create a single-file deployment. Use ``keep_params_as_input=False`` when
# importing from PyTorch:
#
# .. code-block:: python
#
# mod = from_exported_program(exported_program, keep_params_as_input=False)
# # Parameters are now embedded as constants in the module
# executable = tvm.compile(built_mod, target=TARGET)
# # Runtime: vm["main"](input) # No need to pass params!
#
# This creates a single-file deployment (only the ``.so`` is needed), but you
# lose the flexibility to swap parameters without recompiling. For most
# production workflows, separating code and parameters (as shown above) is
# preferred for flexibility.
######################################################################
# Loading and Running the Exported Model
# -----------------------------------------------------------
# To use the exported model on another machine or in a standalone script, you need
# to load both the ``.so`` library and the parameters file. Here's a complete example
# of how to reload and run the model. Save the script shown below as ``run_mlp.py``.
#
# To make it executable from the command line:
#
# .. code-block:: bash
#
# chmod +x run_mlp.py
# ./run_mlp.py # Run it like a regular program
#
# Complete script:
#
# .. code-block:: python
#
# #!/usr/bin/env python3
# import numpy as np
# import tvm
# from tvm import relax
#
# # Step 1: Load the compiled library
# lib = tvm.runtime.load_module("relax_export_artifacts/mlp_cpu.so")
#
# # Step 2: Create Virtual Machine
# device = tvm.cpu(0)
# vm = relax.VirtualMachine(lib, device)
#
# # Step 3: Load parameters from the .npz file
# params_npz = np.load("relax_export_artifacts/model_params.npz")
# params = [tvm.runtime.tensor(params_npz[f"p_{i}"], device)
# for i in range(len(params_npz))]
#
# # Step 4: Prepare input data
# data = np.random.randn(1, 1, 28, 28).astype("float32")
# input_tensor = tvm.runtime.tensor(data, device)
#
# # Step 5: Run inference (pass input followed by all parameters)
# output = vm["main"](input_tensor, *params)
#
# # Step 6: Extract result (output may be tuple or single Tensor)
# # PyTorch models typically return tuples, ONNX models may return a single Tensor
# if isinstance(output, tvm.ir.Array) and len(output) > 0:
# result_tensor = output[0]
# else:
# result_tensor = output
#
# print("Prediction shape:", result_tensor.shape)
# print("Predicted class:", np.argmax(result_tensor.numpy()))
#
# **Running on GPU:**
# To run on GPU instead of CPU, make the following changes:
#
# 1. **Compile for GPU** (in the "Build and Export" step earlier in this tutorial):
#
# .. code-block:: python
#
# TARGET = tvm.target.Target("cuda") # Change from "llvm" to "cuda"
#
# 2. **Use GPU device in the script**:
#
# .. code-block:: python
#
# device = tvm.cuda(0) # Use CUDA device instead of CPU
# vm = relax.VirtualMachine(lib, device)
#
# # Load parameters to GPU
# params = [tvm.runtime.tensor(params_npz[f"p_{i}"], device) # Note: device parameter
# for i in range(len(params_npz))]
#
# # Prepare input on GPU
# input_tensor = tvm.runtime.tensor(data, device) # Note: device parameter
#
# The rest of the script remains the same. All tensors (parameters and inputs)
# must be allocated on the same device (GPU) as the compiled model.
#
# **Deployment Checklist:**
# When moving to another host (via RPC or SCP), you must copy **both** files:
#
# 1. ``mlp_cpu.so`` (or ``mlp_cuda.so`` for GPU) - The compiled model code
# 2. ``model_params.npz`` - The model parameters (serialized as NumPy arrays)
#
# The remote machine needs both files in the same directory. The script above
# assumes they are in ``relax_export_artifacts/`` relative to the current
# working directory (the directory the script is launched from).
# Adjust the paths as needed for your deployment. For GPU deployment, ensure the
# target machine has compatible CUDA drivers and the model was compiled for the
# same GPU architecture.
######################################################################
# Deploying to Remote Devices
# ---------------------------
# To deploy the exported model to a remote ARM Linux device (e.g., Raspberry Pi),
# you can use TVM's RPC mechanism to cross-compile, upload, and run the model
# remotely. This workflow is useful when:
#
# - The target device has limited resources for compilation
# - You want to fine-tune performance by running on the actual hardware
# - You need to deploy to embedded devices
#
# See :doc:`cross_compilation_and_rpc </how_to/tutorials/cross_compilation_and_rpc>`
# for a comprehensive guide on:
#
# - Setting up TVM runtime on the remote device
# - Starting an RPC server on the device
# - Cross-compiling for ARM targets (e.g., ``llvm -mtriple=aarch64-linux-gnu``)
# - Uploading exported libraries via RPC
# - Running inference remotely
#
# Quick example for ARM deployment workflow:
#
# .. code-block:: python
#
# import tvm.rpc as rpc
# from tvm import relax
#
# # Step 1: Cross-compile for ARM target (on local machine)
# TARGET = tvm.target.Target({"kind": "llvm", "mtriple": "aarch64-linux-gnu"})
# executable = tvm.compile(built_mod, target=TARGET)
# executable.export_library("mlp_arm.so")
#
# # Step 2: Connect to remote device RPC server
# remote = rpc.connect("192.168.1.100", 9090) # Device IP and RPC port
#
# # Step 3: Upload the compiled library and parameters
# remote.upload("mlp_arm.so")
# remote.upload("model_params.npz")
#
# # Step 4: Load and run on remote device
# lib = remote.load_module("mlp_arm.so")
# vm = relax.VirtualMachine(lib, remote.cpu())
# # ... prepare input and params, then run inference
#
# The key difference is using an ARM target triple during compilation and
# uploading files via RPC instead of copying them directly.
######################################################################
# FAQ
# ---
# **Can I run the ``.so`` as a standalone executable (like ``./mlp_cpu.so``)?**
# No. The ``.so`` file is a shared library, not a standalone executable binary.
# You cannot run it directly from the terminal. It must be loaded through a TVM
# runtime program (as shown in the "Loading and Running" section above). The
# ``.so`` bundles VM bytecode and compiled kernels, but still requires the TVM
# runtime to execute.
#
# **Which devices can run the exported library?**
# The deployment machine must match the target you compiled for (``llvm`` in
# this example, i.e. CPU code for the build machine's architecture).
# As long as the target triple, runtime ABI, and available devices line up,
# you can move the artifact between machines. For heterogeneous builds (CPU
# plus GPU), ship the extra device libraries as well.
#
# **What about the ``.params`` and ``metadata.json`` files?**
# These auxiliary files are only generated in specific configurations. In this
# tutorial, since we pass parameters at runtime, they are not generated. When
# they do appear, they may be kept alongside the ``.so`` for inspection, but
# the essential content is typically embedded in the shared object itself, so
# deploying the ``.so`` alone is usually sufficient.