docs/how_to/tutorials/bring_your_own_codegen.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 """
 .. _tutorial-bring-your-own-codegen:

 Bring Your Own Codegen: NPU Backend Example
 ===========================================

 This tutorial shows how to integrate a custom hardware backend with TVM's
 BYOC framework, using the bundled example NPU backend (CPU emulation, no
 real hardware required) as the worked example.  You will see the key
 concepts needed to offload operations to a custom accelerator: pattern
 registration, graph partitioning, codegen, and runtime dispatch.

 NPUs are purpose-built accelerators designed around a fixed set of operations
 common in neural network inference, such as matrix multiplication, convolution,
 and activation functions.
 The example backend's runtime is a *stub*: it logs the dispatch decisions an
 NPU would make (memory tier, execution engine, fusion) but performs no real
 computation, so output buffers are uninitialized.  Assertions in this tutorial
 therefore check shapes, not values.  When you replace the runtime with your
 hardware SDK calls, the same flow produces real results.

 **Prerequisites**: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and
 ``USE_EXAMPLE_NPU_RUNTIME=ON``.
 """

 ######################################################################
 # Overview of the BYOC Flow
 # -------------------------
 #
 # The BYOC framework lets you plug a custom backend into TVM's compilation
 # pipeline in four steps:
 #
 # 1. **Register patterns** - describe which sequences of Relax ops the
 #    backend can handle.
 # 2. **Partition the graph** - group matched ops into composite functions.
 # 3. **Run codegen** - lower composite functions to backend-specific
 #    representation (JSON graph for the example NPU).
 # 4. **Execute** - the runtime dispatches composite functions to the
 #    registered backend runtime.

 ######################################################################
 # Step 1: Import the backend to register its patterns
 # ---------------------------------------------------
 #
 # Importing the module is enough to register all supported patterns with
 # TVM's pattern registry.

 import numpy as np

 import tvm
 import tvm.relax.backend.contrib.example_npu  # registers patterns
 from tvm import relax
 from tvm.relax.backend.pattern_registry import get_patterns_with_prefix
 from tvm.relax.transform import FuseOpsByPattern, MergeCompositeFunctions, RunCodegen
 from tvm.script import relax as R

 # Look up the backend's global functions.  ``allow_missing=True`` makes the
 # lookup return ``None`` instead of raising when TVM was built without them.
 has_example_npu_codegen = tvm.get_global_func("relax.ext.example_npu", allow_missing=True)
 has_example_npu_runtime = tvm.get_global_func(
     "runtime.ExampleNPUJSONRuntimeCreate", allow_missing=True
 )
 # Truthy only when both the codegen and the runtime entry points exist.
 has_example_npu = has_example_npu_codegen and has_example_npu_runtime

 # Target handed to ``relax.build`` later in this tutorial.
 target = tvm.target.Target("llvm")

 # Collect every pattern registered under the "example_npu" prefix.
 patterns = get_patterns_with_prefix("example_npu")
 print("Registered patterns:", [pattern.name for pattern in patterns])

 ######################################################################
 # Step 2: Define a model
 # ----------------------
 #
 # We use a simple MatMul + ReLU module to illustrate the flow.


 @tvm.script.ir_module
 class MatmulReLU:
     # Relax module whose ``main`` computes relu(matmul(x, w)) on float32 tensors.
     @R.function
     def main(
         x: R.Tensor((2, 4), dtype="float32"),
         w: R.Tensor((4, 8), dtype="float32"),
     ) -> R.Tensor((2, 8), dtype="float32"):
         # (2, 4) @ (4, 8) -> (2, 8), then ReLU on the product.
         with R.dataflow():
             y = R.matmul(x, w)
             z = R.nn.relu(y)
             R.output(z)
         return z


 ######################################################################
 # Step 3: Partition the graph
 # ---------------------------
 #
 # ``FuseOpsByPattern`` groups ops that match a registered pattern into
 # composite functions, controlled by two flags:
 #
 # - ``bind_constants=False`` keeps weights as function arguments instead
 #   of baking them in, so the host stays in charge of parameter
 #   ownership.
 # - ``annotate_codegen=True`` tags each composite with its backend name
 #   (``example_npu``); without this tag, ``RunCodegen`` has no way to
 #   route the composite to a backend.
 #
 # ``MergeCompositeFunctions`` then consolidates adjacent composites
 # that target the same backend so each group becomes a single external
 # call.  Note that consolidation depends on the patterns themselves: an
 # ``op_a + op_b`` chain only collapses into one composite if a fused
 # pattern (e.g. ``matmul_relu_fused``) was registered for it; otherwise
 # each op stays as its own composite even when both target the same
 # backend.

 # Thread the module through the two partitioning passes, in order:
 # pattern-based fusion first, then merging of the resulting composites.
 mod = MatmulReLU
 for partition_pass in (
     FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True),
     MergeCompositeFunctions(),
 ):
     mod = partition_pass(mod)
 print("After partitioning:", mod, sep="\n")

 ######################################################################
 # Step 4: Run codegen
 # -------------------
 #
 # ``RunCodegen`` lowers each annotated composite function to the backend's
 # serialization format.  For the example NPU this produces a JSON graph
 # that the C++ runtime can execute.
 #
 # Steps 4, 5, and 6 are guarded by ``has_example_npu`` and only run when TVM
 # was built with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and ``USE_EXAMPLE_NPU_RUNTIME=ON``.

 if has_example_npu:
     # Hand every annotated composite function to the example NPU codegen.
     mod = RunCodegen()(mod)
     print("After codegen:", mod, sep="\n")

     ######################################################################
     # Step 5: Build and run
     # ---------------------
     #
     # Build the module for the host target, create a virtual machine, and
     # execute the compiled function.

     # Seeded float32 inputs shaped like the ``main`` parameters.
     np.random.seed(0)
     x_np = np.random.randn(2, 4).astype("float32")
     w_np = np.random.randn(4, 8).astype("float32")

     with tvm.transform.PassContext(opt_level=3):
         built = relax.build(mod, target)

     # One CPU device object serves both the VM and the input tensors.
     dev = tvm.cpu()
     vm = relax.VirtualMachine(built, dev)
     main_args = [tvm.runtime.tensor(host_array, dev) for host_array in (x_np, w_np)]
     result = vm["main"](*main_args)

     # Only the output shape is checked, never the values.
     out_shape = result.numpy().shape
     assert out_shape == (2, 8)
     print("Execution completed. Output shape:", out_shape)

 ######################################################################
 # Step 6: Conv2D + ReLU
 # ---------------------
 #
 # The same flow applies to convolution workloads.  Because the fused
 # ``conv2d + relu`` pattern is registered after the standalone
 # ``conv2d`` pattern in ``patterns.py`` (later entries have higher
 # priority), both ops are offloaded as a single composite function.


 @tvm.script.ir_module
 class Conv2dReLU:
     # Relax module whose ``main`` computes relu(conv2d(x, w)) on float32 tensors.
     @R.function
     def main(
         x: R.Tensor((1, 3, 32, 32), dtype="float32"),
         w: R.Tensor((16, 3, 3, 3), dtype="float32"),
     ) -> R.Tensor((1, 16, 30, 30), dtype="float32"):
         # conv2d is called with no stride/padding arguments; the declared
         # result shape is (1, 16, 30, 30).
         with R.dataflow():
             y = R.nn.conv2d(x, w)
             z = R.nn.relu(y)
             R.output(z)
         return z


 if has_example_npu:
     # Partition and lower in one sweep: fuse, merge, then run codegen.
     mod2 = Conv2dReLU
     for conv_pass in (
         FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True),
         MergeCompositeFunctions(),
         RunCodegen(),
     ):
         mod2 = conv_pass(mod2)

     with tvm.transform.PassContext(opt_level=3):
         built2 = relax.build(mod2, target)

     # Random float32 input and weight shaped like the ``main`` parameters.
     x2_np = np.random.randn(1, 3, 32, 32).astype("float32")
     w2_np = np.random.randn(16, 3, 3, 3).astype("float32")

     dev2 = tvm.cpu()
     vm2 = relax.VirtualMachine(built2, dev2)
     result2 = vm2["main"](tvm.runtime.tensor(x2_np, dev2), tvm.runtime.tensor(w2_np, dev2))

     # Only the output shape is checked, never the values.
     out_shape2 = result2.numpy().shape
     assert out_shape2 == (1, 16, 30, 30)
     print("Conv2dReLU output shape:", out_shape2)

 ######################################################################
 # Next steps
 # ----------
 #
 # To build a real NPU backend using this example as a starting point:
 #
 # - Replace ``example_npu_runtime.cc`` with your hardware SDK calls.
 # - Extend ``patterns.py`` with the ops your hardware supports.
 # - Add a C++ codegen under ``src/relax/backend/contrib/`` if your
 #   hardware requires a non-JSON serialization format.
 # - Add your cmake module under ``cmake/modules/contrib/`` following
 #   the pattern in ``cmake/modules/contrib/ExampleNPU.cmake``.
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	"""
	.. _tutorial-bring-your-own-codegen:

	Bring Your Own Codegen: NPU Backend Example
	===========================================

	This tutorial shows how to integrate a custom hardware backend with TVM's
	BYOC framework, using the bundled example NPU backend (CPU emulation, no
	real hardware required) as the worked example. You will see the key
	concepts needed to offload operations to a custom accelerator: pattern
	registration, graph partitioning, codegen, and runtime dispatch.

	NPUs are purpose-built accelerators designed around a fixed set of operations
	common in neural network inference, such as matrix multiplication, convolution,
	and activation functions.
	The example backend's runtime is a stub: it logs the dispatch decisions an
	NPU would make (memory tier, execution engine, fusion) but performs no real
	computation, so output buffers are uninitialized. Assertions in this tutorial
	therefore check shapes, not values. When you replace the runtime with your
	hardware SDK calls, the same flow produces real results.

	Prerequisites: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and
	``USE_EXAMPLE_NPU_RUNTIME=ON``.
	"""

	######################################################################
	# Overview of the BYOC Flow
	# -------------------------
	#
	# The BYOC framework lets you plug a custom backend into TVM's compilation
	# pipeline in four steps:
	#
	# 1. **Register patterns** - describe which sequences of Relax ops the
	#    backend can handle.
	# 2. **Partition the graph** - group matched ops into composite functions.
	# 3. **Run codegen** - lower composite functions to backend-specific
	#    representation (JSON graph for the example NPU).
	# 4. **Execute** - the runtime dispatches composite functions to the
	#    registered backend runtime.

	######################################################################
	# Step 1: Import the backend to register its patterns
	# ---------------------------------------------------
	#
	# Importing the module is enough to register all supported patterns with
	# TVM's pattern registry.

	import numpy as np

	import tvm
	import tvm.relax.backend.contrib.example_npu # registers patterns
	from tvm import relax
	from tvm.relax.backend.pattern_registry import get_patterns_with_prefix
	from tvm.relax.transform import FuseOpsByPattern, MergeCompositeFunctions, RunCodegen
	from tvm.script import relax as R

	# Look up the backend's global functions.  ``allow_missing=True`` makes the
	# lookup return ``None`` instead of raising when TVM was built without them.
	has_example_npu_codegen = tvm.get_global_func("relax.ext.example_npu", allow_missing=True)
	has_example_npu_runtime = tvm.get_global_func(
	    "runtime.ExampleNPUJSONRuntimeCreate", allow_missing=True
	)
	# Truthy only when both the codegen and the runtime entry points exist.
	has_example_npu = has_example_npu_codegen and has_example_npu_runtime

	# Target handed to ``relax.build`` later in this tutorial.
	target = tvm.target.Target("llvm")

	# Collect every pattern registered under the "example_npu" prefix.
	patterns = get_patterns_with_prefix("example_npu")
	print("Registered patterns:", [pattern.name for pattern in patterns])

	######################################################################
	# Step 2: Define a model
	# ----------------------
	#
	# We use a simple MatMul + ReLU module to illustrate the flow.


	@tvm.script.ir_module
	class MatmulReLU:
	    # Relax module whose ``main`` multiplies a (2, 4) input by a (4, 8)
	    # weight and applies ReLU, yielding a (2, 8) float32 tensor.
	    @R.function
	    def main(
	        x: R.Tensor((2, 4), "float32"),
	        w: R.Tensor((4, 8), "float32"),
	    ) -> R.Tensor((2, 8), "float32"):
	        # Both ops sit in one dataflow block; ``z`` is its only output.
	        with R.dataflow():
	            y = relax.op.matmul(x, w)
	            z = relax.op.nn.relu(y)
	            R.output(z)
	        return z


	######################################################################
	# Step 3: Partition the graph
	# ---------------------------
	#
	# ``FuseOpsByPattern`` groups ops that match a registered pattern into
	# composite functions, controlled by two flags:
	#
	# - ``bind_constants=False`` keeps weights as function arguments instead
	#   of baking them in, so the host stays in charge of parameter
	#   ownership.
	# - ``annotate_codegen=True`` tags each composite with its backend name
	#   (``example_npu``); without this tag, ``RunCodegen`` has no way to
	#   route the composite to a backend.
	#
	# ``MergeCompositeFunctions`` then consolidates adjacent composites
	# that target the same backend so each group becomes a single external
	# call. Note that consolidation depends on the patterns themselves: an
	# ``op_a + op_b`` chain only collapses into one composite if a fused
	# pattern (e.g. ``matmul_relu_fused``) was registered for it; otherwise
	# each op stays as its own composite even when both target the same
	# backend.

	# Thread the module through the two partitioning passes, in order:
	# pattern-based fusion first, then merging of the resulting composites.
	mod = MatmulReLU
	for partition_pass in (
	    FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True),
	    MergeCompositeFunctions(),
	):
	    mod = partition_pass(mod)
	print("After partitioning:", mod, sep="\n")

	######################################################################
	# Step 4: Run codegen
	# -------------------
	#
	# ``RunCodegen`` lowers each annotated composite function to the backend's
	# serialization format. For the example NPU this produces a JSON graph
	# that the C++ runtime can execute.
	#
	# Steps 4, 5, and 6 are guarded by ``has_example_npu`` and only run when TVM
	# was built with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and ``USE_EXAMPLE_NPU_RUNTIME=ON``.

	if has_example_npu:
	    # Everything in this branch needs the example NPU codegen and runtime,
	    # so it is skipped when either global function is missing.
	    mod = RunCodegen()(mod)
	    print("After codegen:")
	    print(mod)

	    ######################################################################
	    # Step 5: Build and run
	    # ---------------------
	    #
	    # Build the module for the host target, create a virtual machine, and
	    # execute the compiled function.

	    # Seed NumPy's global RNG so the generated inputs are reproducible.
	    np.random.seed(0)
	    x_np = np.random.randn(2, 4).astype("float32")
	    w_np = np.random.randn(4, 8).astype("float32")

	    with tvm.transform.PassContext(opt_level=3):
	        built = relax.build(mod, target)

	    vm = relax.VirtualMachine(built, tvm.cpu())
	    result = vm["main"](tvm.runtime.tensor(x_np, tvm.cpu()), tvm.runtime.tensor(w_np, tvm.cpu()))

	    # Only the output shape is asserted, never the values.
	    assert result.numpy().shape == (2, 8)
	    print("Execution completed. Output shape:", result.numpy().shape)

	######################################################################
	# Step 6: Conv2D + ReLU
	# ---------------------
	#
	# The same flow applies to convolution workloads. Because the fused
	# ``conv2d + relu`` pattern is registered after the standalone
	# ``conv2d`` pattern in ``patterns.py`` (later entries have higher
	# priority), both ops are offloaded as a single composite function.


	@tvm.script.ir_module
	class Conv2dReLU:
	    # Relax module whose ``main`` convolves a (1, 3, 32, 32) input with a
	    # (16, 3, 3, 3) weight and applies ReLU; the declared result is
	    # (1, 16, 30, 30) float32.
	    @R.function
	    def main(
	        x: R.Tensor((1, 3, 32, 32), "float32"),
	        w: R.Tensor((16, 3, 3, 3), "float32"),
	    ) -> R.Tensor((1, 16, 30, 30), "float32"):
	        # conv2d is called with no stride/padding arguments.
	        with R.dataflow():
	            y = relax.op.nn.conv2d(x, w)
	            z = relax.op.nn.relu(y)
	            R.output(z)
	        return z


	if has_example_npu:
	    # Same flow as the MatMul example: partition, merge, run codegen.
	    mod2 = Conv2dReLU
	    mod2 = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod2)
	    mod2 = MergeCompositeFunctions()(mod2)
	    mod2 = RunCodegen()(mod2)

	    with tvm.transform.PassContext(opt_level=3):
	        built2 = relax.build(mod2, target)

	    # Random float32 input and weight shaped like the ``main`` parameters.
	    x2_np = np.random.randn(1, 3, 32, 32).astype("float32")
	    w2_np = np.random.randn(16, 3, 3, 3).astype("float32")

	    vm2 = relax.VirtualMachine(built2, tvm.cpu())
	    result2 = vm2["main"](
	        tvm.runtime.tensor(x2_np, tvm.cpu()), tvm.runtime.tensor(w2_np, tvm.cpu())
	    )
	    # Only the output shape is asserted, never the values.
	    assert result2.numpy().shape == (1, 16, 30, 30)
	    print("Conv2dReLU output shape:", result2.numpy().shape)

	######################################################################
	# Next steps
	# ----------
	#
	# To build a real NPU backend using this example as a starting point:
	#
	# - Replace ``example_npu_runtime.cc`` with your hardware SDK calls.
	# - Extend ``patterns.py`` with the ops your hardware supports.
	# - Add a C++ codegen under ``src/relax/backend/contrib/`` if your
	#   hardware requires a non-JSON serialization format.
	# - Add your cmake module under ``cmake/modules/contrib/`` following
	#   the pattern in ``cmake/modules/contrib/ExampleNPU.cmake``.