| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| .. _tutorial-tensor-expr-get-started: |
| |
| Get Started with Tensor Expression |
| ================================== |
| **Author**: `Tianqi Chen <https://tqchen.github.io>`_ |
| |
This is an introductory tutorial to the Tensor expression language in TVM.
TVM uses a domain-specific tensor expression language for efficient kernel construction.
| |
| In this tutorial, we will demonstrate the basic workflow to use |
| the tensor expression language. |
| """ |
| from __future__ import absolute_import, print_function |
| |
| import tvm |
| import tvm.testing |
| from tvm import te |
| import numpy as np |
| |
| # Global declarations of environment. |
| |
| tgt_host = "llvm" |
# Change it to the respective GPU backend if one is enabled, e.g. cuda, opencl, rocm
| tgt = "cuda" |
| |
| ###################################################################### |
| # Vector Add Example |
| # ------------------ |
| # In this tutorial, we will use a vector addition example to demonstrate |
| # the workflow. |
| # |
| |
| ###################################################################### |
| # Describe the Computation |
| # ------------------------ |
| # As a first step, we need to describe our computation. |
| # TVM adopts tensor semantics, with each intermediate result |
| # represented as a multi-dimensional array. The user needs to describe |
| # the computation rule that generates the tensors. |
| # |
# We first define a symbolic variable n to represent the shape.
# We then define two placeholder Tensors, A and B, with the given shape (n,).
#
# We then describe the result tensor C using a compute operation. The
# compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of
# the tensor.
| # |
| # No computation happens during this phase, as we are only declaring how |
| # the computation should be done. |
| # |
| n = te.var("n") |
| A = te.placeholder((n,), name="A") |
| B = te.placeholder((n,), name="B") |
| C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") |
| print(type(C)) |
| |
| ###################################################################### |
| # Schedule the Computation |
| # ------------------------ |
# While the above lines describe the computation rule, C can be computed
# in many ways since its axis can be iterated over in a data-parallel
# manner. TVM asks the user to provide a description of how the
# computation should be performed, called a schedule.
#
# A schedule is a set of transformations that change how the loops of
# the computation are executed in the program.
#
# After we construct the schedule, by default it computes
# C serially in row-major order.
| # |
| # .. code-block:: c |
| # |
| # for (int i = 0; i < n; ++i) { |
| # C[i] = A[i] + B[i]; |
| # } |
| # |
| s = te.create_schedule(C.op) |
| |
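######################################################################
# As a supplementary check (not part of the original tutorial flow), we can
# print the lowered IR of this schedule to confirm the default serial loop
# structure sketched above.
#
print(tvm.lower(s, [A, B, C], simple_mode=True))
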
| ###################################################################### |
# We use the split construct to split the first axis of C.
# This splits the original iteration axis into a product of
# two iteration axes, which is equivalent to the following code.
| # |
| # .. code-block:: c |
| # |
| # for (int bx = 0; bx < ceil(n / 64); ++bx) { |
| # for (int tx = 0; tx < 64; ++tx) { |
| # int i = bx * 64 + tx; |
| # if (i < n) { |
| # C[i] = A[i] + B[i]; |
| # } |
| # } |
| # } |
| # |
| bx, tx = s[C].split(C.op.axis[0], factor=64) |
| |
| ###################################################################### |
# Finally we bind the iteration axes bx and tx to threads in the GPU
# compute grid. These are GPU-specific constructs that allow us
# to generate code that runs on the GPU.
| # |
| if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"): |
| s[C].bind(bx, te.thread_axis("blockIdx.x")) |
| s[C].bind(tx, te.thread_axis("threadIdx.x")) |
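
######################################################################
# As another supplementary check, we can print the lowered IR once more to
# see the effect of the split and, when a GPU target is selected, the
# blockIdx.x / threadIdx.x bindings introduced above.
#
print(tvm.lower(s, [A, B, C], simple_mode=True))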
| |
| ###################################################################### |
| # Compilation |
| # ----------- |
| # After we have finished specifying the schedule, we can compile it |
| # into a TVM function. By default TVM compiles into a type-erased |
| # function that can be directly called from the python side. |
| # |
# In the following line, we use tvm.build to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs), as well as the target
# language we want to compile to.
#
# The result of the compilation, fadd, is a GPU device function (if a GPU
# is involved) together with a host wrapper that calls into the GPU
# function. fadd is the generated host wrapper function; internally it
# contains a reference to the generated device function.
| # |
| fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") |
| |
| ###################################################################### |
| # Run the Function |
| # ---------------- |
# The compiled TVM function exposes a concise C API
# that can be invoked from any language.
| # |
| # We provide a minimal array API in python to aid quick testing and prototyping. |
| # The array API is based on the `DLPack <https://github.com/dmlc/dlpack>`_ standard. |
| # |
| # - We first create a GPU context. |
| # - Then tvm.nd.array copies the data to the GPU. |
| # - fadd runs the actual computation. |
# - asnumpy() copies the GPU array back to the CPU, which we can use to verify correctness.
| # |
| ctx = tvm.context(tgt, 0) |
| |
| n = 1024 |
| a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) |
| b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) |
| c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) |
| fadd(a, b, c) |
| tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) |
| |
| ###################################################################### |
| # Inspect the Generated Code |
| # -------------------------- |
# You can inspect the generated code in TVM. The result of tvm.build
# is a TVM Module. fadd is the host module that contains the host wrapper;
# it also contains a device module for the CUDA (GPU) function.
#
# The following code fetches the device module and prints its source code.
| # |
| if tgt == "cuda" or tgt == "rocm" or tgt.startswith("opencl"): |
| dev_module = fadd.imported_modules[0] |
| print("-----GPU code-----") |
| print(dev_module.get_source()) |
| else: |
| print(fadd.get_source()) |
| |
| ###################################################################### |
| # .. note:: Code Specialization |
| # |
| # As you may have noticed, the declarations of A, B and C all |
| # take the same shape argument, n. TVM will take advantage of this |
| # to pass only a single shape argument to the kernel, as you will find in |
| # the printed device code. This is one form of specialization. |
| # |
# On the host side, TVM automatically generates code that checks the
# constraints on the parameters. So if you pass arrays with different
# shapes into fadd, an error will be raised.
| # |
| # We can do more specializations. For example, we can write |
| # :code:`n = tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, |
| # in the computation declaration. The generated function will |
| # only take vectors with length 1024. |
| # |
| |
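######################################################################
# As a small, supplementary illustration of the fixed-shape specialization
# mentioned above (names such as A_fixed and myadd_fixed are ours, not part
# of the original example), we build a second function that only accepts
# vectors of length 1024 and run it on the CPU so no GPU binding is required.
#
n_fixed = tvm.runtime.convert(1024)
A_fixed = te.placeholder((n_fixed,), name="A_fixed")
B_fixed = te.placeholder((n_fixed,), name="B_fixed")
C_fixed = te.compute(A_fixed.shape, lambda i: A_fixed[i] + B_fixed[i], name="C_fixed")
s_fixed = te.create_schedule(C_fixed.op)
fadd_fixed = tvm.build(s_fixed, [A_fixed, B_fixed, C_fixed], tgt_host, name="myadd_fixed")

cpu_ctx = tvm.cpu(0)
a_fixed = tvm.nd.array(np.random.uniform(size=1024).astype(A_fixed.dtype), cpu_ctx)
b_fixed = tvm.nd.array(np.random.uniform(size=1024).astype(B_fixed.dtype), cpu_ctx)
c_fixed = tvm.nd.array(np.zeros(1024, dtype=C_fixed.dtype), cpu_ctx)
fadd_fixed(a_fixed, b_fixed, c_fixed)
tvm.testing.assert_allclose(c_fixed.asnumpy(), a_fixed.asnumpy() + b_fixed.asnumpy())
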
| ###################################################################### |
| # Save Compiled Module |
| # -------------------- |
| # Besides runtime compilation, we can save the compiled modules into |
| # a file and load them back later. This is called ahead of time compilation. |
| # |
# The following code performs these steps:
#
# - It saves the compiled host module into an object file.
# - Then it saves the device module into a ptx file.
# - cc.create_shared calls a compiler (gcc) to create a shared library.
| # |
| from tvm.contrib import cc |
| from tvm.contrib import util |
| |
| temp = util.tempdir() |
| fadd.save(temp.relpath("myadd.o")) |
| if tgt == "cuda": |
| fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) |
| if tgt == "rocm": |
| fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) |
| if tgt.startswith("opencl"): |
| fadd.imported_modules[0].save(temp.relpath("myadd.cl")) |
| cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) |
| print(temp.listdir()) |
| |
| ###################################################################### |
| # .. note:: Module Storage Format |
| # |
# The CPU (host) module is saved directly as a shared library (.so).
# The device code can be stored in multiple customized formats.
# In our example, the device code is stored as ptx, together with a
# metadata json file. They can be loaded and linked separately via import.
| # |
| |
| ###################################################################### |
| # Load Compiled Module |
| # -------------------- |
| # We can load the compiled module from the file system and run the code. |
| # The following code loads the host and device module separately and |
| # re-links them together. We can verify that the newly loaded function works. |
| # |
| fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) |
| if tgt == "cuda": |
| fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) |
| fadd1.import_module(fadd1_dev) |
| |
| if tgt == "rocm": |
| fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) |
| fadd1.import_module(fadd1_dev) |
| |
| if tgt.startswith("opencl"): |
| fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) |
| fadd1.import_module(fadd1_dev) |
| |
| fadd1(a, b, c) |
| tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) |
| |
| ###################################################################### |
| # Pack Everything into One Library |
| # -------------------------------- |
# In the above example, we store the device and host code separately.
# TVM also supports exporting everything as one shared library.
| # Under the hood, we pack the device modules into binary blobs and link |
| # them together with the host code. |
| # Currently we support packing of Metal, OpenCL and CUDA modules. |
| # |
| fadd.export_library(temp.relpath("myadd_pack.so")) |
| fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) |
| fadd2(a, b, c) |
| tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) |
| |
| ###################################################################### |
| # .. note:: Runtime API and Thread-Safety |
| # |
# The compiled modules of TVM do not depend on the TVM compiler.
# Instead, they only depend on a minimal runtime library.
# The TVM runtime library wraps the device drivers and provides
# thread-safe and device-agnostic calls into the compiled functions.
#
# This means that you can call the compiled TVM functions from any thread,
# on any GPU.
| # |
| |
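######################################################################
# A minimal sketch of that claim (a supplementary example, not part of the
# original tutorial): we call the already-built fadd from two Python threads,
# each writing into its own output array, and verify both results.
#
import threading


def run_add(out):
    # Each call goes through the thread-safe TVM runtime wrapper.
    fadd(a, b, out)


outs = [tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) for _ in range(2)]
threads = [threading.Thread(target=run_add, args=(out,)) for out in outs]
for t in threads:
    t.start()
for t in threads:
    t.join()
for out in outs:
    tvm.testing.assert_allclose(out.asnumpy(), a.asnumpy() + b.asnumpy())
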
| ###################################################################### |
| # Generate OpenCL Code |
| # -------------------- |
# TVM provides code generation for multiple backends; we can also
# generate OpenCL code, or LLVM code that runs on CPU backends.
#
# The following code block generates OpenCL code, creates arrays on an OpenCL
# device, and verifies the correctness of the code.
| # |
| if tgt.startswith("opencl"): |
| fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd") |
| print("------opencl code------") |
| print(fadd_cl.imported_modules[0].get_source()) |
| ctx = tvm.cl(0) |
| n = 1024 |
| a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) |
| b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) |
| c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) |
| fadd_cl(a, b, c) |
| tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy()) |
| |
| ###################################################################### |
| # Summary |
| # ------- |
# This tutorial provides a walk-through of the TVM workflow using
# a vector add example. The general workflow is
#
# - Describe your computation via a series of operations.
# - Describe how we want the computation to be performed using schedule primitives.
# - Compile to the target function we want.
# - Optionally, save the function to be loaded later.
#
# You are more than welcome to check out other examples and
# tutorials to learn more about the supported operations, scheduling primitives,
| # and other features in TVM. |
| # |