tutorials/optimize/opt_conv_tensorcore.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 """
 .. _opt-conv-tensorcore:

 How to optimize convolution using TensorCores
 =============================================
 **Author**: `Siyuan Feng <https://github.com/Hzfengsy>`_

 In this tutorial, we will demonstrate how to write a high performance convolution
 schedule using TensorCores in TVM. In this example, we assume the input to
 convolution has a large batch. We strongly recommend covering the :ref:`opt-conv-gpu` tutorial first.

 """

 ################################################################
 # TensorCore Introduction
 # -----------------------
 # Each Tensor Core provides a 4x4x4 matrix processing array that operates
 # :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices as Figure shows.
 # The matrix multiplication inputs A and B are FP16 matrices, while the accumulation
 # matrices C and D may be FP16 or FP32 matrices.
 #
 # However, CUDA programmers can only use warp-level primitive
 # :code:`wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag)` to perform
 # 16x16x16 half-precision matrix multiplication on tensor cores. Before invoking
 # the matrix multiplication, programmers must load data from memory into registers
 # with primitive :code:`wmma::load_matrix_sync`, explicitly. The NVCC compiler translates
 # that primitive into multiple memory load instructions. At run time, every thread loads
 # 16 elements from matrix A and 16 elements from B.

 ################################################################
 # Preparation and Algorithm
 # -------------------------
 # We use the fixed size for input tensors with 256 channels and 14 x 14 dimensions.
 # The batch size is 256. Convolution filters contain 512 filters of size 3 x 3.
 # We use stride size 1 and padding size 1 for the convolution. In the example, we use
 # NHWCnc memory layout.The following code defines the convolution algorithm in TVM.

 import tvm
 from tvm import te
 import numpy as np
 from tvm.contrib import nvcc

 # The sizes of inputs and filters
 batch_size = 256
 height = 14
 width = 14
 in_channels = 256
 out_channels = 512
 kernel_h = 3
 kernel_w = 3
 pad_h = 1
 pad_w = 1
 stride_h = 1
 stride_w = 1

 # TensorCore shape
 block_size = 16

 assert batch_size % block_size == 0
 assert in_channels % block_size == 0
 assert out_channels % block_size == 0

 # Input feature map: (N, H, W, IC, n, ic)
 data_shape = (
     batch_size // block_size,
     height,
     width,
     in_channels // block_size,
     block_size,
     block_size,
 )
 # Kernel: (H, W, IC, OC, ic, oc)
 kernel_shape = (
     kernel_h,
     kernel_w,
     in_channels // block_size,
     out_channels // block_size,
     block_size,
     block_size,
 )
 # Output feature map: (N, H, W, OC, n, oc)
 output_shape = (
     batch_size // block_size,
     height,
     width,
     out_channels // block_size,
     block_size,
     block_size,
 )

 # Reduction axes
 kh = te.reduce_axis((0, kernel_h), name="kh")
 kw = te.reduce_axis((0, kernel_w), name="kw")
 ic = te.reduce_axis((0, in_channels // block_size), name="ic")
 ii = te.reduce_axis((0, block_size), name="ii")

 # Algorithm
 A = te.placeholder(data_shape, name="A", dtype="float16")
 W = te.placeholder(kernel_shape, name="W", dtype="float16")
 Apad = te.compute(
     (
         batch_size // block_size,
         height + 2 * pad_h,
         width + 2 * pad_w,
         in_channels // block_size,
         block_size,
         block_size,
     ),
     lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
         tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
         A[n, h - pad_h, w - pad_w, i, nn, ii],
         tvm.tir.const(0.0, "float16"),
     ),
     name="Apad",
 )
 Conv = te.compute(
     output_shape,
     lambda n, h, w, o, nn, oo: te.sum(
         Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
         * W[kh, kw, ic, o, ii, oo].astype("float32"),
         axis=[ic, kh, kw, ii],
     ),
     name="Conv",
 )

 s = te.create_schedule(Conv.op)
 s[Apad].compute_inline()

 ###############################################################################
 # Memory Scope
 # ------------
 # In traditional GPU schedule, we have global, shared and local memory scope.
 # To support TensorCores, we add another three special memory scope: :code:`wmma.matrix_a`,
 # :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragments scope
 # stores at the on-chip registers level, the same place with local memory.

 # Designate the memory hierarchy
 AS = s.cache_read(Apad, "shared", [Conv])
 WS = s.cache_read(W, "shared", [Conv])
 AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
 WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
 ConvF = s.cache_write(Conv, "wmma.accumulator")

 ###############################################################################
 # Define Tensor Intrinsic
 # -----------------------
 # In fact, TensorCore is a special hardware operation. So, we can just use tensorize
 # to replace a unit of computation with the TensorCore instruction. The first thing is
 # that we need to define tensor intrinsic.
 #
 # There are four basic operation in TensorCore: :code:`fill_fragment`, :code:`load_matrix`,
 # :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync`
 # are both used in matrix multiplication, so we can just write following three intrinsics.


 def intrin_wmma_load_matrix(scope):
     n = 16
     A = te.placeholder((n, n), name="A", dtype="float16")
     BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=256)
     C = te.compute((n, n), lambda i, j: A[i, j], name="C")
     BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)

     def intrin_func(ins, outs):
         ib = tvm.tir.ir_builder.create()

         BA = ins[0]
         BC = outs[0]
         ib.emit(
             tvm.tir.call_intrin(
                 "handle",
                 "tir.tvm_load_matrix_sync",
                 BC.data,
                 n,
                 n,
                 n,
                 BC.elem_offset // 256,
                 BA.access_ptr("r"),
                 n,
                 "row_major",
             )
         )
         return ib.get()

     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


 def intrin_wmma_gemm():
     n = 16
     A = te.placeholder((n, n), name="A", dtype="float16")
     B = te.placeholder((n, n), name="B", dtype="float16")
     k = te.reduce_axis((0, n), name="k")
     C = te.compute(
         (n, n),
         lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
         name="C",
     )
     BA = tvm.tir.decl_buffer(
         A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=256
     )
     BB = tvm.tir.decl_buffer(
         B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=256
     )
     BC = tvm.tir.decl_buffer(
         C.shape, C.dtype, name="BC", scope="wmma.accumulator", data_alignment=32, offset_factor=256
     )

     def intrin_func(ins, outs):
         BA, BB = ins
         (BC,) = outs

         def init():
             ib = tvm.tir.ir_builder.create()
             ib.emit(
                 tvm.tir.call_intrin(
                     "handle", "tir.tvm_fill_fragment", BC.data, n, n, n, BC.elem_offset // 256, 0.0
                 )
             )
             return ib.get()

         def update():
             ib = tvm.tir.ir_builder.create()
             ib.emit(
                 tvm.tir.call_intrin(
                     "handle",
                     "tir.tvm_mma_sync",
                     BC.data,
                     BC.elem_offset // 256,
                     BA.data,
                     BA.elem_offset // 256,
                     BB.data,
                     BB.elem_offset // 256,
                     BC.data,
                     BC.elem_offset // 256,
                 )
             )
             return ib.get()

         return update(), init(), update()

     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})


 def intrin_wmma_store_matrix():
     n = 16
     A = te.placeholder((n, n), name="A", dtype="float32")
     BA = tvm.tir.decl_buffer(
         A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=256
     )
     C = te.compute((n, n), lambda i, j: A[i, j], name="C")
     BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=256)

     def intrin_func(ins, outs):
         ib = tvm.tir.ir_builder.create()
         BA = ins[0]
         BC = outs[0]
         ib.emit(
             tvm.tir.call_intrin(
                 "handle",
                 "tir.tvm_store_matrix_sync",
                 BA.data,
                 n,
                 n,
                 n,
                 BA.elem_offset // 256,
                 BC.access_ptr("w"),
                 n,
                 "row_major",
             )
         )
         return ib.get()

     return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


 ###############################################################################
 # Scheduling the Computation
 # --------------------------
 # To use TensorCores in TVM, we must schedule the computation into specific structure
 # to match the tensor intrinsic. The same as traditional GPU programs, we can also use
 # shared memory to boost the speed. If you have any questions about blocking and shared
 # memory, please refer :ref:`opt-conv-gpu`.
 #
 # In this example, each block contains 2x4 warps, and each warp calls 4x2 TensorCore
 # instructions. Thus, the output shape of each warp is 64x32 and each block outputs
 # 128x128 titles. Due to the limit of shared memory space, we only load 2 blocks (2x128x128 tiles)
 # one time.
 #
 # .. note::
 #
 #   *Warp-level Operation*
 #
 #   Note that all TensorCore instructions are warp-level instructions, which means all 32 threads
 #   in a warp should do this instruction simultaneously. Making theadIdx.x extent=32 is one of the
 #   easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain
 #   TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution.
 #   The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time.

 # Define tiling sizes
 block_row_warps = 4
 block_col_warps = 2
 warp_row_tiles = 2
 warp_col_tiles = 4
 warp_size = 32
 chunk = 2

 block_x = te.thread_axis("blockIdx.x")
 block_y = te.thread_axis("blockIdx.y")
 block_z = te.thread_axis("blockIdx.z")
 thread_x = te.thread_axis("threadIdx.x")
 thread_y = te.thread_axis("threadIdx.y")
 thread_z = te.thread_axis("threadIdx.z")

 nc, hc, wc, oc, nnc, ooc = Conv.op.axis
 block_k = s[Conv].fuse(hc, wc)
 s[Conv].bind(block_k, block_z)
 nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
 block_i, nc = s[Conv].split(nc, factor=block_row_warps)
 oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
 block_j, oc = s[Conv].split(oc, factor=block_col_warps)
 s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
 s[Conv].bind(block_i, block_x)
 s[Conv].bind(block_j, block_y)
 s[Conv].bind(nc, thread_y)
 s[Conv].bind(oc, thread_z)

 # Schedule local computation
 s[ConvF].compute_at(s[Conv], oc)
 n, h, w, o, nnf, oof = ConvF.op.axis
 ko, ki = s[ConvF].split(ic, factor=chunk)
 s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

 # Move intermediate computation into each output compute tile
 s[AF].compute_at(s[ConvF], kw)
 s[WF].compute_at(s[ConvF], kw)

 # Schedule for A's share memory
 s[AS].compute_at(s[ConvF], kh)
 n, h, w, i, nn, ii = AS.op.axis
 tx, xo = s[AS].split(n, nparts=block_row_warps)
 ty, yo = s[AS].split(xo, nparts=block_col_warps)
 t = s[AS].fuse(nn, ii)
 to, ti = s[AS].split(t, factor=warp_size)
 s[AS].bind(tx, thread_y)
 s[AS].bind(ty, thread_z)
 s[AS].bind(ti, thread_x)

 # Schedule for W's share memory
 s[WS].compute_at(s[ConvF], kh)
 kh, kw, ic, o, ii, oo = WS.op.axis
 tx, xo = s[WS].split(o, nparts=block_row_warps)
 ty, yo = s[WS].split(xo, nparts=block_col_warps)
 t = s[WS].fuse(ii, oo)
 to, ti = s[WS].split(t, nparts=warp_size)
 s[WS].bind(tx, thread_y)
 s[WS].bind(ty, thread_z)
 s[WS].bind(to, thread_x)
 s[WS].vectorize(ti)
 print(tvm.lower(s, [A, W, Conv], simple_mode=True))

 ###############################################################################
 # Lowering Computation to Intrinsics
 # ----------------------------------
 # The last phase is to lower the computation loops down to TensorCore hardware intrinsics
 # by mapping the 2D convolution to tensor intrinsics

 s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_a"))
 s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_b"))
 s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
 s[ConvF].tensorize(nnf, intrin_wmma_gemm())
 print(tvm.lower(s, [A, W, Conv], simple_mode=True))

 ###############################################################################
 # Generate CUDA Kernel
 # --------------------
 # Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution.
 # Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not
 # be able to run on our build server

 ctx = tvm.gpu(0)
 if nvcc.have_tensorcore(ctx.compute_version):
     with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}):
         func = tvm.build(s, [A, W, Conv], "cuda")
     a_np = np.random.uniform(size=data_shape).astype(A.dtype)
     w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
     a = tvm.nd.array(a_np, ctx)
     w = tvm.nd.array(w_np, ctx)
     c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
     evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
     print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

 ###############################################################################
 # Summary
 # -------
 # This tutorial demonstrates how TVM scheduling primitives can be used to
 # call TensorCores on specific GPUs.
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	"""
	.. _opt-conv-tensorcore:

	How to optimize convolution using TensorCores
	=============================================
	Author: `Siyuan Feng <https://github.com/Hzfengsy>`_

	In this tutorial, we will demonstrate how to write a high performance convolution
	schedule using TensorCores in TVM. In this example, we assume the input to
	convolution has a large batch. We strongly recommend covering the :ref:`opt-conv-gpu` tutorial first.

	"""

	################################################################
	# TensorCore Introduction
	# -----------------------
	# Each Tensor Core provides a 4x4x4 matrix processing array that operates
	# :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices as Figure shows.
	# The matrix multiplication inputs A and B are FP16 matrices, while the accumulation
	# matrices C and D may be FP16 or FP32 matrices.
	#
	# However, CUDA programmers can only use warp-level primitive
	# :code:`wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag)` to perform
	# 16x16x16 half-precision matrix multiplication on tensor cores. Before invoking
	# the matrix multiplication, programmers must load data from memory into registers
	# with primitive :code:`wmma::load_matrix_sync`, explicitly. The NVCC compiler translates
	# that primitive into multiple memory load instructions. At run time, every thread loads
	# 16 elements from matrix A and 16 elements from B.

	################################################################
	# Preparation and Algorithm
	# -------------------------
	# We use the fixed size for input tensors with 256 channels and 14 x 14 dimensions.
	# The batch size is 256. Convolution filters contain 512 filters of size 3 x 3.
	# We use stride size 1 and padding size 1 for the convolution. In the example, we use
	# NHWCnc memory layout.The following code defines the convolution algorithm in TVM.

	import tvm
	from tvm import te
	import numpy as np
	from tvm.contrib import nvcc

	# The sizes of inputs and filters
	batch_size = 256
	height = 14
	width = 14
	in_channels = 256
	out_channels = 512
	kernel_h = 3
	kernel_w = 3
	pad_h = 1
	pad_w = 1
	stride_h = 1
	stride_w = 1

	# TensorCore shape
	block_size = 16

	assert batch_size % block_size == 0
	assert in_channels % block_size == 0
	assert out_channels % block_size == 0

	# Input feature map: (N, H, W, IC, n, ic)
	data_shape = (
	batch_size // block_size,
	height,
	width,
	in_channels // block_size,
	block_size,
	block_size,
	)
	# Kernel: (H, W, IC, OC, ic, oc)
	kernel_shape = (
	kernel_h,
	kernel_w,
	in_channels // block_size,
	out_channels // block_size,
	block_size,
	block_size,
	)
	# Output feature map: (N, H, W, OC, n, oc)
	output_shape = (
	batch_size // block_size,
	height,
	width,
	out_channels // block_size,
	block_size,
	block_size,
	)

	# Reduction axes
	kh = te.reduce_axis((0, kernel_h), name="kh")
	kw = te.reduce_axis((0, kernel_w), name="kw")
	ic = te.reduce_axis((0, in_channels // block_size), name="ic")
	ii = te.reduce_axis((0, block_size), name="ii")

	# Algorithm
	A = te.placeholder(data_shape, name="A", dtype="float16")
	W = te.placeholder(kernel_shape, name="W", dtype="float16")
	Apad = te.compute(
	(
	batch_size // block_size,
	height + 2 * pad_h,
	width + 2 * pad_w,
	in_channels // block_size,
	block_size,
	block_size,
	),
	lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
	tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
	A[n, h - pad_h, w - pad_w, i, nn, ii],
	tvm.tir.const(0.0, "float16"),
	),
	name="Apad",
	)
	Conv = te.compute(
	output_shape,
	lambda n, h, w, o, nn, oo: te.sum(
	Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
	* W[kh, kw, ic, o, ii, oo].astype("float32"),
	axis=[ic, kh, kw, ii],
	),
	name="Conv",
	)

	s = te.create_schedule(Conv.op)
	s[Apad].compute_inline()

	###############################################################################
	# Memory Scope
	# ------------
	# In traditional GPU schedule, we have global, shared and local memory scope.
	# To support TensorCores, we add another three special memory scope: :code:`wmma.matrix_a`,
	# :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragments scope
	# stores at the on-chip registers level, the same place with local memory.

	# Designate the memory hierarchy
	AS = s.cache_read(Apad, "shared", [Conv])
	WS = s.cache_read(W, "shared", [Conv])
	AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
	WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
	ConvF = s.cache_write(Conv, "wmma.accumulator")

	###############################################################################
	# Define Tensor Intrinsic
	# -----------------------
	# In fact, TensorCore is a special hardware operation. So, we can just use tensorize
	# to replace a unit of computation with the TensorCore instruction. The first thing is
	# that we need to define tensor intrinsic.
	#
	# There are four basic operation in TensorCore: :code:`fill_fragment`, :code:`load_matrix`,
	# :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync`
	# are both used in matrix multiplication, so we can just write following three intrinsics.


	def intrin_wmma_load_matrix(scope):
	n = 16
	A = te.placeholder((n, n), name="A", dtype="float16")
	BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=256)
	C = te.compute((n, n), lambda i, j: A[i, j], name="C")
	BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256)

	def intrin_func(ins, outs):
	ib = tvm.tir.ir_builder.create()

	BA = ins[0]
	BC = outs[0]
	ib.emit(
	tvm.tir.call_intrin(
	"handle",
	"tir.tvm_load_matrix_sync",
	BC.data,
	n,
	n,
	n,
	BC.elem_offset // 256,
	BA.access_ptr("r"),
	n,
	"row_major",
	)
	)
	return ib.get()

	return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


	def intrin_wmma_gemm():
	n = 16
	A = te.placeholder((n, n), name="A", dtype="float16")
	B = te.placeholder((n, n), name="B", dtype="float16")
	k = te.reduce_axis((0, n), name="k")
	C = te.compute(
	(n, n),
	lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
	name="C",
	)
	BA = tvm.tir.decl_buffer(
	A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=256
	)
	BB = tvm.tir.decl_buffer(
	B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=256
	)
	BC = tvm.tir.decl_buffer(
	C.shape, C.dtype, name="BC", scope="wmma.accumulator", data_alignment=32, offset_factor=256
	)

	def intrin_func(ins, outs):
	BA, BB = ins
	(BC,) = outs

	def init():
	ib = tvm.tir.ir_builder.create()
	ib.emit(
	tvm.tir.call_intrin(
	"handle", "tir.tvm_fill_fragment", BC.data, n, n, n, BC.elem_offset // 256, 0.0
	)
	)
	return ib.get()

	def update():
	ib = tvm.tir.ir_builder.create()
	ib.emit(
	tvm.tir.call_intrin(
	"handle",
	"tir.tvm_mma_sync",
	BC.data,
	BC.elem_offset // 256,
	BA.data,
	BA.elem_offset // 256,
	BB.data,
	BB.elem_offset // 256,
	BC.data,
	BC.elem_offset // 256,
	)
	)
	return ib.get()

	return update(), init(), update()

	return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})


	def intrin_wmma_store_matrix():
	n = 16
	A = te.placeholder((n, n), name="A", dtype="float32")
	BA = tvm.tir.decl_buffer(
	A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=256
	)
	C = te.compute((n, n), lambda i, j: A[i, j], name="C")
	BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=256)

	def intrin_func(ins, outs):
	ib = tvm.tir.ir_builder.create()
	BA = ins[0]
	BC = outs[0]
	ib.emit(
	tvm.tir.call_intrin(
	"handle",
	"tir.tvm_store_matrix_sync",
	BA.data,
	n,
	n,
	n,
	BA.elem_offset // 256,
	BC.access_ptr("w"),
	n,
	"row_major",
	)
	)
	return ib.get()

	return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC})


	###############################################################################
	# Scheduling the Computation
	# --------------------------
	# To use TensorCores in TVM, we must schedule the computation into specific structure
	# to match the tensor intrinsic. The same as traditional GPU programs, we can also use
	# shared memory to boost the speed. If you have any questions about blocking and shared
	# memory, please refer :ref:`opt-conv-gpu`.
	#
	# In this example, each block contains 2x4 warps, and each warp calls 4x2 TensorCore
	# instructions. Thus, the output shape of each warp is 64x32 and each block outputs
	# 128x128 titles. Due to the limit of shared memory space, we only load 2 blocks (2x128x128 tiles)
	# one time.
	#
	# .. note::
	#
	# Warp-level Operation
	#
	# Note that all TensorCore instructions are warp-level instructions, which means all 32 threads
	# in a warp should do this instruction simultaneously. Making theadIdx.x extent=32 is one of the
	# easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain
	# TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution.
	# The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time.

	# Define tiling sizes
	block_row_warps = 4
	block_col_warps = 2
	warp_row_tiles = 2
	warp_col_tiles = 4
	warp_size = 32
	chunk = 2

	block_x = te.thread_axis("blockIdx.x")
	block_y = te.thread_axis("blockIdx.y")
	block_z = te.thread_axis("blockIdx.z")
	thread_x = te.thread_axis("threadIdx.x")
	thread_y = te.thread_axis("threadIdx.y")
	thread_z = te.thread_axis("threadIdx.z")

	nc, hc, wc, oc, nnc, ooc = Conv.op.axis
	block_k = s[Conv].fuse(hc, wc)
	s[Conv].bind(block_k, block_z)
	nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
	block_i, nc = s[Conv].split(nc, factor=block_row_warps)
	oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
	block_j, oc = s[Conv].split(oc, factor=block_col_warps)
	s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
	s[Conv].bind(block_i, block_x)
	s[Conv].bind(block_j, block_y)
	s[Conv].bind(nc, thread_y)
	s[Conv].bind(oc, thread_z)

	# Schedule local computation
	s[ConvF].compute_at(s[Conv], oc)
	n, h, w, o, nnf, oof = ConvF.op.axis
	ko, ki = s[ConvF].split(ic, factor=chunk)
	s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

	# Move intermediate computation into each output compute tile
	s[AF].compute_at(s[ConvF], kw)
	s[WF].compute_at(s[ConvF], kw)

	# Schedule for A's share memory
	s[AS].compute_at(s[ConvF], kh)
	n, h, w, i, nn, ii = AS.op.axis
	tx, xo = s[AS].split(n, nparts=block_row_warps)
	ty, yo = s[AS].split(xo, nparts=block_col_warps)
	t = s[AS].fuse(nn, ii)
	to, ti = s[AS].split(t, factor=warp_size)
	s[AS].bind(tx, thread_y)
	s[AS].bind(ty, thread_z)
	s[AS].bind(ti, thread_x)

	# Schedule for W's share memory
	s[WS].compute_at(s[ConvF], kh)
	kh, kw, ic, o, ii, oo = WS.op.axis
	tx, xo = s[WS].split(o, nparts=block_row_warps)
	ty, yo = s[WS].split(xo, nparts=block_col_warps)
	t = s[WS].fuse(ii, oo)
	to, ti = s[WS].split(t, nparts=warp_size)
	s[WS].bind(tx, thread_y)
	s[WS].bind(ty, thread_z)
	s[WS].bind(to, thread_x)
	s[WS].vectorize(ti)
	print(tvm.lower(s, [A, W, Conv], simple_mode=True))

	###############################################################################
	# Lowering Computation to Intrinsics
	# ----------------------------------
	# The last phase is to lower the computation loops down to TensorCore hardware intrinsics
	# by mapping the 2D convolution to tensor intrinsics

	s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_a"))
	s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_b"))
	s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
	s[ConvF].tensorize(nnf, intrin_wmma_gemm())
	print(tvm.lower(s, [A, W, Conv], simple_mode=True))

	###############################################################################
	# Generate CUDA Kernel
	# --------------------
	# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution.
	# Since TensorCores are only supported in NVIDIA GPU with Compute Capability 7.0 or higher, it may not
	# be able to run on our build server

	ctx = tvm.gpu(0)
	if nvcc.have_tensorcore(ctx.compute_version):
	with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}):
	func = tvm.build(s, [A, W, Conv], "cuda")
	a_np = np.random.uniform(size=data_shape).astype(A.dtype)
	w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
	a = tvm.nd.array(a_np, ctx)
	w = tvm.nd.array(w_np, ctx)
	c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
	evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
	print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

	###############################################################################
	# Summary
	# -------
	# This tutorial demonstrates how TVM scheduling primitives can be used to
	# call TensorCores on specific GPUs.