| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| Tuning High Performance Convolution on NVIDIA GPUs |
| ========================================================================= |
| **Author**: `Lianmin Zheng <https://github.com/merrymercy>`_ |
| |
This is an advanced tutorial for writing a high performance tunable template for
NVIDIA GPUs. By running the auto-tuner on this template, we can outperform the
vendor-provided library cuDNN in many cases.
| """ |
| |
| ###################################################################### |
| # Install dependencies |
| # -------------------- |
# To use the autotvm package in TVM, we need to install some extra dependencies
# (change "3" to "2" if you use Python 2):
| # |
| # .. code-block:: bash |
| # |
| # pip3 install --user psutil xgboost tornado |
| # |
# To make TVM run faster during tuning, it is recommended to use Cython
# as the FFI of TVM. In the root directory of TVM, execute
| # |
| # .. code-block:: bash |
| # |
| # pip3 install --user cython |
| # sudo make cython3 |
| # |
# Now return to the Python code and import the packages.
| |
| import logging |
| import sys |
| import numpy as np |
| |
| import tvm |
| from tvm import te |
| from tvm import topi |
from tvm.topi.testing import conv2d_nchw_python
import tvm.testing
| |
| from tvm import autotvm |
| |
| ###################################################################### |
| # Step 1: Define the search space |
| # -------------------------------- |
# There are plenty of useful schedule primitives in TVM. You can also find
# some tutorials that describe them in more detail, such as
| # (1). :ref:`opt-conv-gpu` |
| # (2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example>`_ |
| # |
# However, their implementations are manually tuned for some special input
# shapes. In this section, we build a search space large enough to cover
# the techniques used in these tutorials. Then we rely on the efficient auto-tuner
# to search through this space and pick some good configurations.
| # |
# If you are familiar with writing CUDA schedules, you will find the following
# template very general. In fact, this template can easily be modified
# to tune other operators such as depthwise convolution and GEMM.
# To fully understand this template, you should be familiar with
# the schedule primitives and the auto-tuning API. You can refer to the above
# tutorials and the :doc:`autotvm tutorial <tune_simple_template>`.
| # |
# It is worth noting that the search space for a conv2d operator
# can be very large (on the order of 10^9 configurations for some input shapes).
| # |
| |
| |
| @autotvm.template("tutorial/conv2d_no_batching") |
| def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): |
| assert N == 1, "Only consider batch_size = 1 in this template" |
| |
| data = te.placeholder((N, CI, H, W), name="data") |
| kernel = te.placeholder((CO, CI, KH, KW), name="kernel") |
| conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") |
| s = te.create_schedule([conv.op]) |
| |
| ##### space definition begin ##### |
| n, f, y, x = s[conv].op.axis |
| rc, ry, rx = s[conv].op.reduce_axis |
| |
| cfg = autotvm.get_config() |
| cfg.define_split("tile_f", f, num_outputs=4) |
| cfg.define_split("tile_y", y, num_outputs=4) |
| cfg.define_split("tile_x", x, num_outputs=4) |
| cfg.define_split("tile_rc", rc, num_outputs=3) |
| cfg.define_split("tile_ry", ry, num_outputs=3) |
| cfg.define_split("tile_rx", rx, num_outputs=3) |
| cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) |
| cfg.define_knob("unroll_explicit", [0, 1]) |
| ##### space definition end ##### |
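    # The 4-way spatial splits above are later bound to blockIdx / vthread /
    # threadIdx / an inner serial loop; the 3-way reduction splits control how
    # loads are staged through shared memory and registers.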
| |
| # inline padding |
| pad_data = s[conv].op.input_tensors[0] |
| s[pad_data].compute_inline() |
| data, raw_data = pad_data, data |
| |
| output = conv |
| OL = s.cache_write(conv, "local") |
| |
    # create cache stages: AA/WW stage tiles of the data/kernel in shared memory,
    # and AL/WL stage them again in registers (local scope)
| AA = s.cache_read(data, "shared", [OL]) |
| WW = s.cache_read(kernel, "shared", [OL]) |
| AL = s.cache_read(AA, "local", [OL]) |
| WL = s.cache_read(WW, "local", [OL]) |
| |
| # tile and bind spatial axes |
| n, f, y, x = s[output].op.axis |
| bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) |
| by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) |
| bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) |
    kernel_scope = n  # this is the scope to attach global config inside this kernel
| |
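    # bind the outermost tiles to blocks, the second-level tiles to virtual
    # threads (vthread interleaves work items to improve data reuse), and the
    # third-level tiles to threads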
| s[output].bind(bf, te.thread_axis("blockIdx.z")) |
| s[output].bind(by, te.thread_axis("blockIdx.y")) |
| s[output].bind(bx, te.thread_axis("blockIdx.x")) |
| s[output].bind(vf, te.thread_axis("vthread")) |
| s[output].bind(vy, te.thread_axis("vthread")) |
| s[output].bind(vx, te.thread_axis("vthread")) |
| s[output].bind(tf, te.thread_axis("threadIdx.z")) |
| s[output].bind(ty, te.thread_axis("threadIdx.y")) |
| s[output].bind(tx, te.thread_axis("threadIdx.x")) |
| s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) |
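    # compute the local (register-level) result within each thread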
| s[OL].compute_at(s[output], tx) |
| |
| # tile reduction axes |
| n, f, y, x = s[OL].op.axis |
| rc, ry, rx = s[OL].op.reduce_axis |
| rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) |
| ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) |
| rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) |
| s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) |
| |
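    # shared memory tiles (AA, WW) are loaded once per outer reduction step (rxo);
    # register tiles (AL, WL) are refreshed once per middle reduction step (rxm)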
| s[AA].compute_at(s[OL], rxo) |
| s[WW].compute_at(s[OL], rxo) |
| s[AL].compute_at(s[OL], rxm) |
| s[WL].compute_at(s[OL], rxm) |
| |
    # cooperative fetching: all threads in a block cooperate to load the shared tiles
| for load in [AA, WW]: |
| n, f, y, x = s[load].op.axis |
| fused = s[load].fuse(n, f, y, x) |
| tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) |
| ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) |
| tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) |
| s[load].bind(tz, te.thread_axis("threadIdx.z")) |
| s[load].bind(ty, te.thread_axis("threadIdx.y")) |
| s[load].bind(tx, te.thread_axis("threadIdx.x")) |
| |
| # tune unroll |
| s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) |
| s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) |
| |
| return s, [raw_data, kernel, conv] |
| |
| |
| ###################################################################### |
| # Step 2: Search through the space |
| # --------------------------------- |
# We pick the last layer of resnet as our test case.
# Since our space is very large, :code:`XGBoostTuner` is most suitable
# for our case. Here we only do 20 trials for demonstration.
# In practice, running 1000 trials usually finds some good kernels
# for this template.
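#
# If you already have tuning records from a previous session, you can optionally
# warm-start the tuner from them. A short sketch using autotvm's record utilities
# (it assumes a ``conv2d.log`` produced by an earlier run):
#
# .. code-block:: python
#
#     tuner = autotvm.tuner.XGBTuner(task)
#     tuner.load_history(autotvm.record.load_from_file("conv2d.log"))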
| |
| # logging config (for printing tuning log to screen) |
| logging.getLogger("autotvm").setLevel(logging.DEBUG) |
| logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) |
| |
| # the last layer in resnet |
| N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) |
| task = autotvm.task.create( |
| "tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target="cuda" |
| ) |
| print(task.config_space) |
| |
# Use the local GPU and measure each config several times (repeat=3, with at
# least 100 ms of runtime per measurement) to reduce variance.
# The timeout for compiling a program is 10 seconds; the timeout for running is 4 seconds.
| measure_option = autotvm.measure_option( |
| builder=autotvm.LocalBuilder(), |
| runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4), |
| ) |
| |
# Begin tuning, logging records to the file `conv2d.log`.
# During tuning we will also try many invalid configs, so expect to
# see many error reports. As long as you can see non-zero GFLOPS, it is okay.
| tuner = autotvm.tuner.XGBTuner(task) |
| tuner.tune( |
| n_trial=20, |
| measure_option=measure_option, |
| callbacks=[autotvm.callback.log_to_file("conv2d.log")], |
| ) |
| |
| ######################################################################### |
# Finally we can inspect the best config from the log file, check correctness,
# and measure the running time.
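#
# Optionally, ``autotvm.record.pick_best`` can distill the full tuning log into a
# smaller file that keeps only the best record per workload (a sketch; the output
# filename here is arbitrary):
#
# .. code-block:: python
#
#     autotvm.record.pick_best("conv2d.log", "conv2d_best.log")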
| |
| # inspect the best config |
| dispatch_context = autotvm.apply_history_best("conv2d.log") |
| best_config = dispatch_context.query(task.target, task.workload) |
| print("\nBest config:") |
| print(best_config) |
| |
| # apply history best from log file |
| with autotvm.apply_history_best("conv2d.log"): |
| with tvm.target.Target("cuda"): |
| s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding) |
| func = tvm.build(s, arg_bufs) |
| |
| # check correctness |
| a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) |
| w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) |
| c_np = conv2d_nchw_python(a_np, w_np, strides, padding) |
| |
| ctx = tvm.gpu() |
| a_tvm = tvm.nd.array(a_np, ctx=ctx) |
| w_tvm = tvm.nd.array(w_np, ctx=ctx) |
| c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx) |
| func(a_tvm, w_tvm, c_tvm) |
| |
| tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) |
| |
# Evaluate the running time. Here we choose a large repeat number (400) to reduce noise
# and the kernel launch overhead. You can also use nvprof to validate the result.
| evaluator = func.time_evaluator(func.entry_name, ctx, number=400) |
| print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean) |