| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| .. _tir_blitz: |
| |
| Blitz Course to TensorIR |
| ======================== |
| **Author**: `Siyuan Feng <https://github.com/Hzfengsy>`_ |
| |
TensorIR is a domain-specific language for deep learning programs serving two broad purposes:
| |
| - An implementation for transforming and optimizing programs on various hardware backends. |
| |
- An abstraction for automatic *tensorized* program optimization.
| |
| """ |
| |
| import tvm |
| from tvm.ir.module import IRModule |
| from tvm.script import tir as T |
| import numpy as np |
| |
| ################################################################################################ |
| # IRModule |
| # -------- |
# An IRModule is the central data structure in TVM: it contains the deep learning program and is
# the basic object of interest for IR transformations and model building.
| # |
| # .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_life_of_irmodule.png |
| # :align: center |
| # :width: 85% |
| # |
# The image above shows the life cycle of an IRModule, which can be created from TVMScript.
# TensorIR schedule primitives and passes are the two major ways to transform an IRModule, and a
# sequence of such transformations can be chained together. Note that we can print an IRModule to
# TVMScript at **ANY** stage. After all transformations and optimizations are complete, we can
# build the IRModule into a runnable module and deploy it on target devices.
| # |
# Based on the design of TensorIR and IRModule, TVM enables a new programming method:
#
# 1. Write a program in TVMScript, a Python-AST-based syntax.
#
# 2. Transform and optimize the program with Python APIs.
#
# 3. Interactively inspect the program and tune its performance with an imperative-style
#    transformation API.
| |
| |
| ################################################################################################ |
| # Create an IRModule |
| # ------------------ |
# An IRModule can be created by writing TVMScript, which is a round-trippable syntax for TVM IR.
#
# Unlike creating a computational expression with Tensor Expression
# (:ref:`tutorial-tensor-expr-get-started`), TensorIR allows users to program through TVMScript,
# a language embedded in the Python AST. This makes it possible to write complex programs and to
# further schedule and optimize them.
#
# The following is a simple example that adds 1.0 to every element of a vector.
| # |
| |
| |
| @tvm.script.ir_module |
| class MyModule: |
| @T.prim_func |
| def main(a: T.handle, b: T.handle): |
        # We exchange data between functions through handles, which are similar to pointers.
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # Create buffers from the handles.
        A = T.match_buffer(a, (8,), dtype="float32")
        B = T.match_buffer(b, (8,), dtype="float32")
| for i in range(8): |
| # A block is an abstraction for computation. |
| with T.block("B"): |
| # Define a spatial block iterator and bind it to value i. |
| vi = T.axis.spatial(8, i) |
| B[vi] = A[vi] + 1.0 |
| |
| |
| ir_module = MyModule |
| print(type(ir_module)) |
| print(ir_module.script()) |
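
################################################################################################
# Since TVMScript is round-trippable, the printed script can be parsed back into an equivalent
# IRModule. A minimal sanity check (assuming :code:`tvm.script.from_source` is available in your
# TVM version):

round_trip_module = tvm.script.from_source(ir_module.script())
print(type(round_trip_module))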
| |
| ################################################################################################ |
# Alternatively, we can use the Tensor Expression DSL to write simple operators and convert them
# to an IRModule.
| # |
| |
| from tvm import te |
| |
| A = te.placeholder((8,), dtype="float32", name="A") |
| B = te.compute((8,), lambda *i: A(*i) + 1.0, name="B") |
| func = te.create_prim_func([A, B]) |
| ir_module_from_te = IRModule({"main": func}) |
| print(ir_module_from_te.script()) |
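
################################################################################################
# The TE-derived module can be built and run in exactly the same way as the hand-written one.
# A brief sketch (its result should match :code:`MyModule` on the same input):

mod_from_te = tvm.build(ir_module_from_te, target="llvm")
a_te = tvm.nd.array(np.arange(8).astype("float32"))
b_te = tvm.nd.array(np.zeros((8,)).astype("float32"))
mod_from_te(a_te, b_te)
print(b_te)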
| |
| |
| ################################################################################################ |
| # Build and Run an IRModule |
| # ------------------------- |
# We can build the IRModule into a runnable module for a specific target backend.
| # |
| |
| mod = tvm.build(ir_module, target="llvm") # The module for CPU backends. |
| print(type(mod)) |
| |
| ################################################################################################ |
| # Prepare the input array and output array, then run the module. |
| # |
| |
| a = tvm.nd.array(np.arange(8).astype("float32")) |
| b = tvm.nd.array(np.zeros((8,)).astype("float32")) |
| mod(a, b) |
| print(a) |
| print(b) |
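
################################################################################################
# As a sanity check, the output should equal the input plus one. A minimal verification with
# NumPy (assuming your TVM version provides :code:`NDArray.numpy`; older releases use
# :code:`asnumpy`):

np.testing.assert_allclose(b.numpy(), a.numpy() + 1.0, rtol=1e-5)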
| |
| |
| ################################################################################################ |
| # Transform an IRModule |
| # --------------------- |
# The IRModule is the central data structure for program optimization and can be transformed
# by a :code:`Schedule`.
# A schedule contains multiple primitive methods that interactively transform the program.
# Each primitive transforms the program in a particular way to expose additional
# performance optimizations.
| # |
| # .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_tensor_ir_opt_flow.png |
| # :align: center |
| # :width: 100% |
| # |
# The image above shows a typical workflow for optimizing a tensor program. First, we create a
# schedule on the initial IRModule, built from either TVMScript or Tensor Expression. Then a
# sequence of schedule primitives improves the performance. Finally, we lower and build it into
# a runnable module.
#
# Here we demonstrate a very simple transformation. First, we create a schedule on the input
# :code:`ir_module`.
| |
| sch = tvm.tir.Schedule(ir_module) |
| print(type(sch)) |
| |
| ################################################################################################ |
| # Tile the loop into 3 loops and print the result. |
| |
| # Get block by its name |
| block_b = sch.get_block("B") |
| # Get loops surrounding the block |
| (i,) = sch.get_loops(block_b) |
# Tile the loop into three nested loops.
| i_0, i_1, i_2 = sch.split(i, factors=[2, 2, 2]) |
| print(sch.mod.script()) |
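
################################################################################################
# As a quick check, the block is now surrounded by three loops instead of one.

print(len(sch.get_loops(block_b)))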
| |
| |
| ################################################################################################ |
# We can also reorder the loops. Here we move loop :code:`i_2` outside of :code:`i_1`.
| sch.reorder(i_0, i_2, i_1) |
| print(sch.mod.script()) |
| |
| |
| ################################################################################################ |
| # Transform to a GPU program |
| # ~~~~~~~~~~~~~~~~~~~~~~~~~~ |
# If we want to deploy models on GPUs, thread binding is necessary. Fortunately, we can
# also use primitives to apply this transformation incrementally.
| # |
| |
| sch.bind(i_0, "blockIdx.x") |
| sch.bind(i_2, "threadIdx.x") |
| print(sch.mod.script()) |
| |
| |
| ################################################################################################ |
# After binding the threads, we can build the IRModule for the :code:`cuda` backend.
| ctx = tvm.cuda(0) |
| cuda_mod = tvm.build(sch.mod, target="cuda") |
| cuda_a = tvm.nd.array(np.arange(8).astype("float32"), ctx) |
| cuda_b = tvm.nd.array(np.zeros((8,)).astype("float32"), ctx) |
| cuda_mod(cuda_a, cuda_b) |
| print(cuda_a) |
| print(cuda_b) |
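
################################################################################################
# Finally, we can verify the GPU result just as on the CPU, and inspect the generated CUDA
# kernel source (a brief sketch, assuming a CUDA-capable device is available):

np.testing.assert_allclose(cuda_b.numpy(), cuda_a.numpy() + 1.0, rtol=1e-5)
print(cuda_mod.imported_modules[0].get_source())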