| """ |
| Using External Libraries in NNVM |
| ================================ |
| **Author**: `Masahiro Masuda <https://github.com/masahi>`_ |
| |
This is a short tutorial on how to use external libraries such as cuDNN or cuBLAS with NNVM.
| |
NNVM uses TVM internally to generate target-specific code. For example, with the CUDA backend TVM generates CUDA kernels for all layers in the user-provided network.
| But sometimes it is also helpful to incorporate external libraries developed by various vendors into NNVM. |
| Luckily, TVM has a mechanism to transparently call into these libraries. |
For NNVM users, all we need to do is set the target string appropriately.
| |
Before we can use external libraries from NNVM, TVM needs to be built with the libraries you want to use.
For example, to use cuDNN, the USE_CUDNN option in tvm/make/config.mk needs to be enabled, and the cuDNN include and library directories need to be specified.
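
A minimal sketch of the relevant settings in tvm/make/config.mk is shown below. Only USE_CUDNN comes from the text above; the additional flags and paths are assumptions that depend on where CUDA and cuDNN are installed on your system.

.. code-block:: text

    USE_CUDNN = 1
    # point the build to the cuDNN headers and libraries if they are not in a default location
    ADD_CFLAGS = -I/usr/local/cuda/include
    ADD_LDFLAGS = -L/usr/local/cuda/lib64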
| |
| To begin with, we import NNVM and TVM. |
| """ |
| import tvm |
| import numpy as np |
| from tvm.contrib import graph_runtime as runtime |
| import nnvm.symbol as sym |
| import nnvm.compiler |
| from nnvm.testing import utils |
| |
| ###################################################################### |
| # Create a simple network |
| # ----------------------- |
| # Let's create a very simple network for demonstration. |
| # It consists of convolution, batch normalization, and ReLU activation. |
| |
| out_channels = 16 |
| data = sym.Variable(name="data") |
simple_net = sym.conv2d(data=data, kernel_size=(3, 3), channels=out_channels, padding=(1, 1), use_bias=True)
| simple_net = sym.batch_norm(data=simple_net) |
| simple_net = sym.relu(data=simple_net) |
| |
| batch_size = 1 |
| data_shape = (batch_size, 3, 224, 224) |
| net, params = utils.create_workload(simple_net, batch_size, data_shape[1:]) |
| |
| ###################################################################### |
| # Build and run with cuda backend |
| # ------------------------------- |
# We build and run this network with the CUDA backend, as usual.
# By setting the logging level to DEBUG, we can dump the result of NNVM graph compilation as pseudo code.
| import logging |
| logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion |
| |
| target = "cuda" |
| graph, lib, params = nnvm.compiler.build( |
| net, target, shape={"data": data_shape}, params=params) |
| |
| ctx = tvm.context(target, 0) |
| data = np.random.uniform(-1, 1, size=data_shape).astype("float32") |
| module = runtime.create(graph, lib, ctx) |
| module.set_input(**params) |
| module.set_input("data", data) |
| module.run() |
| out_shape = (batch_size, out_channels, 224, 224) |
| out = module.get_output(0, tvm.nd.empty(out_shape)) |
| out_cuda = out.asnumpy() |
| |
| ###################################################################### |
# The generated pseudo code should look like the one below.
| # Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel. |
| # TVM generates a single, fused kernel from this representation. |
| # |
| # .. code-block:: text |
| # |
| # produce compute { |
| # // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 112 |
| # // attr [input1.shared] storage_scope = "shared" |
| # allocate input1.shared[float32 * 16 * 3 * 3 * 3] |
| # // attr [compute] storage_scope = "local" |
| # allocate compute[float32 * 16 * 1 * 1 * 1 * 1] |
| # // attr [pad_temp.global.global.shared] storage_scope = "shared" |
| # allocate pad_temp.global.global.shared[float32 * 1 * 1 * 4 * 57 * 4] |
| # // attr [iter_var(threadIdx.x, Range(min=0, extent=448), threadIdx.x)] thread_extent = 448 |
| # produce compute { |
| # produce input1.shared { |
| # for (ax0, 0, 16) { |
| # if (likely((threadIdx.x < 27))) { |
| # input1.shared[(threadIdx.x + (ax0*27))] = input1[((((((blockIdx.x/112)*48) + (threadIdx.x/9))*9) + (threadIdx.x % 9)) + (ax0*27))] |
| # } |
| # } |
| # } |
| # compute[0] = 0.000000f |
| # compute[1] = 0.000000f |
| # compute[2] = 0.000000f |
| # compute[3] = 0.000000f |
| # compute[4] = 0.000000f |
| # compute[5] = 0.000000f |
| # compute[6] = 0.000000f |
| # compute[7] = 0.000000f |
| # compute[8] = 0.000000f |
| # compute[9] = 0.000000f |
| # compute[10] = 0.000000f |
| # compute[11] = 0.000000f |
| # compute[12] = 0.000000f |
| # compute[13] = 0.000000f |
| # compute[14] = 0.000000f |
| # compute[15] = 0.000000f |
| # for (rc, 0, 3) { |
| # produce pad_temp.global.global.shared { |
| # if (likely((threadIdx.x < 228))) { |
| # if (likely(((blockIdx.x*2) < (226 - (threadIdx.x/57))))) { |
| # pad_temp.global.global.shared[ramp((threadIdx.x*4), 1, 4)] = pad_temp[ramp(((((((blockIdx.x*2) + (threadIdx.x/57))*57) + (threadIdx.x % 57)) + (rc*12882))*4), 1, 4)] |
| # } |
| # } |
| # } |
| # for (ry, 0, 3) { |
| # for (rx, 0, 3) { |
| # compute[0] = (compute[0] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[((((rc*3) + ry)*3) + rx)])) |
| # compute[1] = (compute[1] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 27)])) |
| # compute[2] = (compute[2] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 54)])) |
| # compute[3] = (compute[3] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 81)])) |
| # compute[4] = (compute[4] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 108)])) |
| # compute[5] = (compute[5] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 135)])) |
| # compute[6] = (compute[6] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 162)])) |
| # compute[7] = (compute[7] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 189)])) |
| # compute[8] = (compute[8] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 216)])) |
| # compute[9] = (compute[9] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 243)])) |
| # compute[10] = (compute[10] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 270)])) |
| # compute[11] = (compute[11] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 297)])) |
| # compute[12] = (compute[12] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 324)])) |
| # compute[13] = (compute[13] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 351)])) |
| # compute[14] = (compute[14] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 378)])) |
| # compute[15] = (compute[15] + (pad_temp.global.global.shared[(((((threadIdx.x/224)*228) + (threadIdx.x % 224)) + (ry*228)) + rx)]*input1.shared[(((((rc*3) + ry)*3) + rx) + 405)])) |
| # } |
| # } |
| # } |
| # } |
| # compute[(((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224))] = max((((compute[0] + input2[((blockIdx.x/112)*16)])*input3[((blockIdx.x/112)*16)]) + input4[((blockIdx.x/112)*16)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 50176)] = max((((compute[1] + input2[(((blockIdx.x/112)*16) + 1)])*input3[(((blockIdx.x/112)*16) + 1)]) + input4[(((blockIdx.x/112)*16) + 1)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 100352)] = max((((compute[2] + input2[(((blockIdx.x/112)*16) + 2)])*input3[(((blockIdx.x/112)*16) + 2)]) + input4[(((blockIdx.x/112)*16) + 2)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 150528)] = max((((compute[3] + input2[(((blockIdx.x/112)*16) + 3)])*input3[(((blockIdx.x/112)*16) + 3)]) + input4[(((blockIdx.x/112)*16) + 3)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 200704)] = max((((compute[4] + input2[(((blockIdx.x/112)*16) + 4)])*input3[(((blockIdx.x/112)*16) + 4)]) + input4[(((blockIdx.x/112)*16) + 4)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 250880)] = max((((compute[5] + input2[(((blockIdx.x/112)*16) + 5)])*input3[(((blockIdx.x/112)*16) + 5)]) + input4[(((blockIdx.x/112)*16) + 5)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 301056)] = max((((compute[6] + input2[(((blockIdx.x/112)*16) + 6)])*input3[(((blockIdx.x/112)*16) + 6)]) + input4[(((blockIdx.x/112)*16) + 6)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 351232)] = max((((compute[7] + input2[(((blockIdx.x/112)*16) + 7)])*input3[(((blockIdx.x/112)*16) + 7)]) + input4[(((blockIdx.x/112)*16) + 7)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 401408)] = max((((compute[8] + input2[(((blockIdx.x/112)*16) + 8)])*input3[(((blockIdx.x/112)*16) + 8)]) + input4[(((blockIdx.x/112)*16) + 8)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 451584)] = max((((compute[9] + input2[(((blockIdx.x/112)*16) + 9)])*input3[(((blockIdx.x/112)*16) + 9)]) + input4[(((blockIdx.x/112)*16) + 9)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 501760)] = max((((compute[10] + input2[(((blockIdx.x/112)*16) + 10)])*input3[(((blockIdx.x/112)*16) + 10)]) + input4[(((blockIdx.x/112)*16) + 10)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 551936)] = max((((compute[11] + input2[(((blockIdx.x/112)*16) + 11)])*input3[(((blockIdx.x/112)*16) + 11)]) + input4[(((blockIdx.x/112)*16) + 11)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 602112)] = max((((compute[12] + input2[(((blockIdx.x/112)*16) + 12)])*input3[(((blockIdx.x/112)*16) + 12)]) + input4[(((blockIdx.x/112)*16) + 12)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 652288)] = max((((compute[13] + input2[(((blockIdx.x/112)*16) + 13)])*input3[(((blockIdx.x/112)*16) + 13)]) + input4[(((blockIdx.x/112)*16) + 13)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 702464)] = max((((compute[14] + input2[(((blockIdx.x/112)*16) + 14)])*input3[(((blockIdx.x/112)*16) + 14)]) + input4[(((blockIdx.x/112)*16) + 14)]), 0.000000f) |
| # compute[((((((blockIdx.x + ((blockIdx.x/112)*1792))*2) + (threadIdx.x/224))*224) + (threadIdx.x % 224)) + 752640)] = max((((compute[15] + input2[(((blockIdx.x/112)*16) + 15)])*input3[(((blockIdx.x/112)*16) + 15)]) + input4[(((blockIdx.x/112)*16) + 15)]), 0.000000f) |
| # } |
| # |
| |
| ###################################################################### |
| # Use cuDNN for a convolutional layer |
| # ----------------------------------- |
# We can use cuDNN to replace our convolution kernels with cuDNN's implementation.
# To do that, all we need to do is append the option " -libs=cudnn" to the target string.
| net, params = utils.create_workload(simple_net, batch_size, data_shape[1:]) |
| target = "cuda -libs=cudnn" # use cudnn for convolution |
| graph, lib, params = nnvm.compiler.build( |
| net, target, shape={"data": data_shape}, params=params) |
| |
| ctx = tvm.context(target, 0) |
| data = np.random.uniform(-1, 1, size=data_shape).astype("float32") |
| module = runtime.create(graph, lib, ctx) |
| module.set_input(**params) |
| module.set_input("data", data) |
| module.run() |
| out_shape = (batch_size, out_channels, 224, 224) |
| out = module.get_output(0, tvm.nd.empty(out_shape)) |
| out_cudnn = out.asnumpy() |
| |
| ###################################################################### |
# Note that if you use cuDNN, NNVM cannot fuse convolution with the layers that follow it.
# This is because layer fusion happens at the level of the TVM internal representation (IR).
# NNVM treats external libraries as a black box, so there is no way to fuse them with the TVM IR.
| # |
# The pseudo code below shows that the cuDNN convolution + bias add + batch norm + ReLU is turned into two stages of computation, one for the cuDNN call and the other for the rest of the operations.
| # |
| # .. code-block:: text |
| # |
| # allocate y[float32 * 1 * 16 * 224 * 224] |
| # produce y { |
| # // attr [0] extern_scope = 0 |
| # tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(input0, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(input1, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0)) |
| # } |
| # produce compute { |
| # // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 1568 |
| # // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512 |
| # compute[((((((blockIdx.x*512) + threadIdx.x)/50176) + ((((blockIdx.x*512) + threadIdx.x)/802816)*16))*50176) + ((((((blockIdx.x*512) + threadIdx.x)/224) % 224)*224) + (((blockIdx.x*64) + threadIdx.x) % 224)))] = max((((y[((((((blockIdx.x*512) + threadIdx.x)/50176) + ((((blockIdx.x*512) + threadIdx.x)/802816)*16))*50176) + ((((((blockIdx.x*512) + threadIdx.x)/224) % 224)*224) + (((blockIdx.x*64) + threadIdx.x) % 224)))] + input2[(((blockIdx.x*512) + threadIdx.x)/50176)])*input3[(((blockIdx.x*512) + threadIdx.x)/50176)]) + input4[(((blockIdx.x*512) + threadIdx.x)/50176)]), 0.000000f) |
| # } |
| # |
| |
| ###################################################################### |
| # Verify the result |
| # ----------------- |
# We can check that the results of the two runs match.
| |
| tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5) |
| |
| ##################################################################### |
| # Conclusion |
| # ---------- |
| # This tutorial covered the usage of cuDNN with NNVM. |
| # We also have support for cuBLAS. If cuBLAS is enabled, it will be used inside a fully connected layer (nnvm.symbol.dense). |
# To use cuBLAS, set the target string to "cuda -libs=cublas".
| # You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas". |
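#
# For example, offloading a fully connected layer to cuBLAS could look like the sketch below.
# This is illustrative only: the layer size and input shape are made up, and the build/run flow
# is the same as in the convolution example above.
#
# .. code-block:: python
#
#    dense_data = sym.Variable(name="data")
#    dense_net = sym.dense(data=dense_data, units=10)  # hypothetical layer size
#    dense_shape = (batch_size, 100)                   # hypothetical input shape
#    net, params = utils.create_workload(dense_net, batch_size, dense_shape[1:])
#    graph, lib, params = nnvm.compiler.build(
#        net, "cuda -libs=cublas", shape={"data": dense_shape}, params=params)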
| # |
# For the ROCm backend, we have support for MIOpen and rocBLAS.
# They can be enabled with the target "rocm -libs=miopen,rocblas".
| # |
# Being able to use external libraries is great, but we need to keep some caveats in mind.
| # |
| # First, the use of external libraries may restrict your usage of TVM and NNVM. |
# For example, MIOpen only supports the NCHW layout and the fp32 data type at the moment, so you cannot use other layouts or data types in TVM.
| # |
| # Second, and more importantly, external libraries restrict the possibility of operator fusion during graph compilation, as shown above. |
# TVM and NNVM aim to achieve the best performance on a variety of hardware, with joint operator-level and graph-level optimization.
# To achieve this goal, we should continue developing better optimizations for TVM and NNVM, while using external libraries as a nice way to fall back to existing implementations when necessary.