| import tvm |
| from tvm import autotvm |
| import topi |
| import topi.testing |
| import numpy as np |
| from topi.util import get_const_tuple |
| from topi.nn.util import get_pad_tuple |
| from tvm.contrib.pickle_memoize import memoize |
| |
| from common import get_all_backend |
| |
| def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1): |
| in_width = in_height |
| filter_channel = in_channel |
| filter_width = filter_height |
| stride_h = stride_w = stride |
| |
| if dilation == 1: |
| # Transform the padding argument from 'str' to 'tuple', |
| # because we need it to match the "workload" tuple to the records in TopHub. |
| pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) |
| padding_args = (pad_h, pad_w) |
| else: |
| padding_args = padding |
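| # Illustrative values (assuming TOPI's get_pad_tuple semantics): for padding="SAME" with |
| # a 3x3 filter, get_pad_tuple returns (pad_top, pad_left, pad_down, pad_right) = (1, 1, 1, 1), |
| # so padding_args == (1, 1); for "VALID" it is all zeros. With dilation != 1 the original |
| # padding argument is passed through unchanged. |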
| |
| # placeholder |
| Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input') |
| Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter') |
| Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') |
| Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') |
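| # For NCHW depthwise conv2d the filter layout is (in_channel, channel_multiplier, kh, kw), |
| # so the output (and hence Scale/Shift) has in_channel * channel_multiplier channels. |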
| |
| dtype = 'float32' |
| |
| def check_device(device): |
| ctx = tvm.context(device, 0) |
| if not ctx.exist: |
| print("Skip because %s is not enabled" % device) |
| return |
| print("Running on target: %s" % device) |
| with tvm.target.create(device): |
| # declare |
| DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, |
| (stride_h, stride_w), padding_args, dilation, dtype) |
| ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) |
| Relu = topi.nn.relu(ScaleShift) |
| # schedule |
| s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) |
| s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift) |
| s3 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) |
| # build the kernels |
| f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) |
| f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) |
| f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) |
| |
| # Prepare plain-old-data (POD) shapes for the test-data closure below. |
| input_shape = get_const_tuple(Input.shape) |
| filter_shape = get_const_tuple(Filter.shape) |
| scale_shape = get_const_tuple(Scale.shape) |
| shift_shape = get_const_tuple(Shift.shape) |
| scale_shift_shape = get_const_tuple(ScaleShift.shape) |
| |
| # Use memoize to pickle the test data so later runs can reuse it. |
| @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") |
| def get_ref_data(): |
| input_np = np.random.uniform(size=input_shape).astype(dtype) |
| filter_np = np.random.uniform(size=filter_shape).astype(dtype) |
| dilated_filter_np = topi.testing.dilate_python(filter_np, (1, 1, dilation, dilation)) |
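| # Dilating a k x k filter by factor d gives an effective size of (k - 1) * d + 1, |
| # e.g. a 3x3 filter with dilation=2 becomes a zero-interleaved 5x5 filter. |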
| scale_np = np.random.uniform(size=scale_shape).astype(dtype) |
| shift_np = np.random.uniform(size=shift_shape).astype(dtype) |
| # compute reference results with the scipy-based implementation |
| depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( |
| input_np, dilated_filter_np, stride, padding) |
| scale_shift_scipy = np.zeros(shape=scale_shift_shape) |
| for c in range(in_channel * channel_multiplier): |
| scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c] |
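| # Equivalent NumPy broadcasting form (not used here, shown for clarity): |
| # scale_shift_scipy = depthwise_conv2d_scipy * scale_np.reshape(1, -1, 1, 1) + shift_np.reshape(1, -1, 1, 1) |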
| relu_scipy = np.maximum(scale_shift_scipy, 0) |
| return (input_np, filter_np, scale_np, shift_np, |
| depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) |
| # Get the test data |
| (input_np, filter_np, scale_np, shift_np, |
| depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() |
| |
| input_tvm = tvm.nd.array(input_np, ctx) |
| filter_tvm = tvm.nd.array(filter_np, ctx) |
| scale_tvm = tvm.nd.array(scale_np, ctx) |
| shift_tvm = tvm.nd.array(shift_np, ctx) |
| depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) |
| scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) |
| relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) |
| # launch kernel 1 (depthwise_conv2d) |
| timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) |
| tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean |
| # launch kernel 2 (depthwise_conv2d + scale_shift) |
| timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) |
| tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean |
| # launch kernel 3 (depthwise_conv2d + scale_shift + relu) |
| timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) |
| tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean |
| tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) |
| tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) |
| tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) |
| |
| for device in get_all_backend(): |
| with autotvm.tophub.context(device): # load tophub pre-tuned parameters |
| check_device(device) |
| |
| |
| def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1): |
| in_width = in_height |
| filter_channel = in_channel |
| filter_width = filter_height |
| stride_w = stride_h |
| |
| if dilation == 1: |
| # Transform the padding argument from 'str' to 'tuple', |
| # because we need it to match the "workload" tuple to the records in TopHub. |
| pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) |
| padding_args = (pad_h, pad_w) |
| else: |
| padding_args = padding |
| |
| # placeholder |
| Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') |
| Filter = tvm.placeholder((filter_height, filter_width, filter_channel, channel_multiplier), name='Filter') |
| Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') |
| Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') |
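| # NHWC depthwise filters are laid out as (kh, kw, in_channel, channel_multiplier); the |
| # output again has in_channel * channel_multiplier channels. |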
| |
| dtype = 'float32' |
| |
| def check_device(device): |
| ctx = tvm.context(device, 0) |
| if not ctx.exist: |
| print("Skip because %s is not enabled" % device) |
| return |
| print("Running on target: %s" % device) |
| |
| with tvm.target.create(device): |
| # declare |
| DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, |
| (stride_h, stride_w), padding_args, dilation, dtype) |
| ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) |
| Relu = topi.nn.relu(ScaleShift) |
| # schedule |
| s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) |
| s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift) |
| s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu) |
| # build the kernels |
| f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) |
| f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) |
| f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) |
| |
| # Prepare plain-old-data (POD) shapes for the test-data closure below. |
| input_shape = get_const_tuple(Input.shape) |
| filter_shape = get_const_tuple(Filter.shape) |
| scale_shape = get_const_tuple(Scale.shape) |
| shift_shape = get_const_tuple(Shift.shape) |
| scale_shift_shape = get_const_tuple(ScaleShift.shape) |
| |
| # Use memoize to pickle the test data so later runs can reuse it. |
| @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc.v2") |
| def get_ref_data(): |
| input_np = np.random.uniform(size=input_shape).astype(dtype) |
| filter_np = np.random.uniform(size=filter_shape).astype(dtype) |
| dilated_filter_np = topi.testing.dilate_python(filter_np, (dilation, dilation, 1, 1)) |
| scale_np = np.random.uniform(size=scale_shape).astype(dtype) |
| shift_np = np.random.uniform(size=shift_shape).astype(dtype) |
| # compute reference results with the scipy-based implementation |
| depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc( |
| input_np, dilated_filter_np, stride=[stride_h, stride_w], padding=padding) |
| scale_shift_scipy = np.zeros(shape=scale_shift_shape) |
| for c in range(in_channel * channel_multiplier): |
| scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c] |
| relu_scipy = np.maximum(scale_shift_scipy, 0) |
| return (input_np, filter_np, scale_np, shift_np, |
| depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) |
| # Get the test data |
| (input_np, filter_np, scale_np, shift_np, |
| depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() |
| |
| # prepare data |
| input_tvm = tvm.nd.array(input_np, ctx) |
| filter_tvm = tvm.nd.array(filter_np, ctx) |
| scale_tvm = tvm.nd.array(scale_np, ctx) |
| shift_tvm = tvm.nd.array(shift_np, ctx) |
| depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) |
| scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) |
| relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) |
| # launch kernel 1 (depthwise_conv2d) |
| timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) |
| tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean |
| # launch kernel 2 (depthwise_conv2d + scale_shift) |
| timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) |
| tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean |
| # launch kernel 3 (depthwise_conv2d + scale_shift + relu) |
| timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) |
| tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean |
| tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) |
| tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) |
| tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) |
| |
| for device in get_all_backend(): |
| with autotvm.tophub.context(device): # load tophub pre-tuned parameters |
| check_device(device) |
| |
| def _transform_data(data, bn): |
| # NCHW -> NCHW[x]c |
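| # e.g. a (1, 32, 56, 56) NCHW array with bn=8 becomes (1, 4, 56, 56, 8) in NCHW8c layout. |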
| batch_size, channel, height, width = data.shape |
| data = np.reshape(data, (batch_size, channel//bn, bn, height, width)) |
| data = np.transpose(data, (0, 1, 3, 4, 2)) |
| return data |
| |
| def _transform_kernel(kernel, bn): |
| # channel, channel_multiplier, kh, kw -> out_channel_chunk, kh, kw, out_channel_block |
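| # e.g. a (32, 2, 3, 3) kernel with bn=8 has out_channel = 64 and becomes (8, 3, 3, 8). |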
| channel, channel_multiplier, kh, kw = kernel.shape |
| out_channel = channel * channel_multiplier |
| kernel = np.reshape(kernel, (out_channel//bn, bn, kh, kw)) |
| kernel = np.transpose(kernel, (0, 2, 3, 1)) |
| return kernel |
| |
| def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1): |
| in_width = in_height |
| filter_channel = in_channel |
| filter_width = filter_height |
| stride_h = stride_w = stride |
| |
| assert dilation == 1, "depthwise_conv2d_NCHWc currently does not support dilation." |
| pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) |
| padding_args = (pad_h, pad_w) |
| |
| out_channel = filter_channel * channel_multiplier |
| # For testing functionality we choose an arbitrary block size that |
| # divides the channel count, regardless of performance. |
| oc_block = 1 |
| for bn in range(16, 0, -1): |
| if out_channel % bn == 0: |
| oc_block = bn |
| break |
| |
| ic_block = 1 |
| for bn in range(oc_block, 0, -1): |
| if in_channel % bn == 0: |
| ic_block = bn |
| break |
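| # e.g. for in_channel=728 and channel_multiplier=1, out_channel=728 and the largest |
| # divisor of 728 that is <= 16 is 14, so oc_block = ic_block = 14. |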
| |
| # placeholder |
| Input = tvm.placeholder((batch, in_channel//ic_block, in_height, in_width, ic_block), name='Input') |
| Filter = tvm.placeholder((out_channel//oc_block, filter_height, filter_width, oc_block), name='Filter') |
| in_layout = "NCHW%dc" % ic_block |
| out_layout = "NCHW%dc" % oc_block |
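| # e.g. ic_block=8 yields in_layout "NCHW8c", matching the 5-D Input placeholder above. |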
| dtype = 'float32' |
| |
| def check_device(device): |
| ctx = tvm.context(device, 0) |
| if not ctx.exist: |
| print("Skip because %s is not enabled" % device) |
| return |
| print("Running on target: %s" % device) |
| with tvm.target.create(device): |
| # declare |
| DepthwiseConv2d = topi.nn.depthwise_conv2d_NCHWc(Input, Filter, |
| (stride_h, stride_w), |
| padding_args, |
| (dilation, dilation), |
| in_layout, |
| out_layout, dtype) |
| # TODO: add a scale_shift implementation for NCHWc and test it here |
| Relu = topi.nn.relu(DepthwiseConv2d) |
| # schedule |
| s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d) |
| s2 = topi.generic.schedule_depthwise_conv2d_nchw(Relu) |
| # build the kernels |
| f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) |
| f2 = tvm.build(s2, [Input, Filter, Relu], device) |
| |
| # Prepare plain-old-data (POD) shapes for the test-data closure below. |
| input_shape = (batch, in_channel, in_height, in_width) |
| filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width) |
| |
| # Use memoize to pickle the test data so later runs can reuse it. |
| @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc") |
| def get_ref_data(): |
| input_np = np.random.uniform(size=input_shape).astype(dtype) |
| filter_np = np.random.uniform(size=filter_shape).astype(dtype) |
| # compute reference results with the scipy-based implementation |
| depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw( |
| input_np, filter_np, stride, padding) |
| relu_scipy = np.maximum(depthwise_conv2d_scipy, 0) |
| return (_transform_data(input_np, ic_block), |
| _transform_kernel(filter_np, oc_block), |
| _transform_data(depthwise_conv2d_scipy, oc_block), |
| _transform_data(relu_scipy, oc_block)) |
| |
| # Get the test data |
| (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() |
| |
| input_tvm = tvm.nd.array(input_np, ctx) |
| filter_tvm = tvm.nd.array(filter_np, ctx) |
| depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), |
| dtype=DepthwiseConv2d.dtype), ctx) |
| relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) |
| # launch kernel 1 (depthwise_conv2d) |
| f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) |
| # launch kernel 2 (depthwise_conv2d + relu) |
| f2(input_tvm, filter_tvm, relu_tvm) |
| tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) |
| tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) |
| |
| # Test llvm only for now, since the depthwise_conv2d_NCHWc implementation is missing in other backends. |
| for device in ["llvm"]: |
| with autotvm.tophub.context(device): # load tophub pre-tuned parameters |
| check_device(device) |
| |
| |
| def test_depthwise_conv2d(): |
| # mobilenet workloads |
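| # argument order: (batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding[, dilation]) |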
| depthwise_conv2d_with_workload_nchw(1, 32, 112, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 64, 112, 1, 3, 2, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 128, 56, 1, 3, 2, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 256, 28, 1, 3, 2, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 512, 14, 1, 3, 2, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 1024, 7, 1, 3, 1, "SAME") |
| |
| # NCHW |
| depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "SAME") |
| depthwise_conv2d_with_workload_nchw(1, 728, 32, 1, 3, 1, "VALID") |
| depthwise_conv2d_with_workload_nchw(4, 256, 64, 2, 5, 2, "VALID") |
| # dilation = 2 |
| depthwise_conv2d_with_workload_nchw(1, 728, 64, 1, 3, 1, "SAME", dilation=2) |
| |
| # NHWC |
| depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME") |
| depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID") |
| depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID") |
| # dilation = 2 |
| # disabled because it uses too much shared memory on cuda |
| # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2) |
| |
| # NCHW[x]c |
| depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME") |
| depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "SAME") |
| depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID") |
| depthwise_conv2d_with_workload_NCHWc(4, 256, 64, 2, 5, 2, "VALID") |
| |
| |
| if __name__ == "__main__": |
| test_depthwise_conv2d() |