| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| .. _tune_relay_x86: |
| |
| Auto-tuning a convolutional network for x86 CPU |
| =============================================== |
| **Author**: `Yao Wang <https://github.com/kevinthesun>`_, `Eddie Yan <https://github.com/eqy>`_ |
| |
This is a tutorial on how to tune a convolutional neural network
for x86 CPUs.
| """ |
| import os |
| import numpy as np |
| |
| import tvm |
| from tvm import te |
| from tvm import autotvm |
| from tvm import relay |
| from tvm.relay import testing |
| from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner |
| from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner |
| import tvm.contrib.graph_runtime as runtime |
| |
| ################################################################# |
| # Define network |
| # -------------- |
# First we need to define the network in the relay frontend API.
# We can either load a pre-defined network from :code:`relay.testing`
# or build one with relay, such as :any:`relay.testing.resnet`.
# We can also load models from MXNet, ONNX and TensorFlow.
#
# In this tutorial, we choose resnet-18 as the tuning example.
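#
# As an illustration, an ONNX model could be imported as follows (a sketch
# that is not run in this tutorial; the file name "resnet18.onnx" is
# hypothetical):
#
# .. code-block:: python
#
#     import onnx
#
#     onnx_model = onnx.load("resnet18.onnx")
#     mod, params = relay.frontend.from_onnx(onnx_model, shape={"0": input_shape})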
| |
| |
| def get_network(name, batch_size): |
| """Get the symbol definition and random weight of a network""" |
| input_shape = (batch_size, 3, 224, 224) |
| output_shape = (batch_size, 1000) |
| |
| if "resnet" in name: |
| n_layer = int(name.split("-")[1]) |
| mod, params = relay.testing.resnet.get_workload( |
| num_layers=n_layer, batch_size=batch_size, dtype=dtype |
| ) |
| elif "vgg" in name: |
| n_layer = int(name.split("-")[1]) |
| mod, params = relay.testing.vgg.get_workload( |
| num_layers=n_layer, batch_size=batch_size, dtype=dtype |
| ) |
| elif name == "mobilenet": |
| mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) |
| elif name == "squeezenet_v1.1": |
| mod, params = relay.testing.squeezenet.get_workload( |
| batch_size=batch_size, version="1.1", dtype=dtype |
| ) |
| elif name == "inception_v3": |
| input_shape = (1, 3, 299, 299) |
| mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) |
| elif name == "mxnet": |
| # an example for mxnet model |
| from mxnet.gluon.model_zoo.vision import get_model |
| |
| block = get_model("resnet18_v1", pretrained=True) |
| mod, params = relay.frontend.from_mxnet(block, shape={input_name: input_shape}, dtype=dtype) |
| net = mod["main"] |
| net = relay.Function( |
| net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs |
| ) |
| mod = tvm.IRModule.from_expr(net) |
| else: |
| raise ValueError("Unsupported network: " + name) |
| |
| return mod, params, input_shape, output_shape |
| |
| |
| # Replace "llvm" with the correct target of your CPU. |
| # For example, for AWS EC2 c5 instance with Intel Xeon |
| # Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". |
| # For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be |
| # "llvm -mcpu=core-avx2". |
| target = "llvm" |
| |
| batch_size = 1 |
| dtype = "float32" |
| model_name = "resnet-18" |
| log_file = "%s.log" % model_name |
| graph_opt_sch_file = "%s_graph_opt.log" % model_name |
| |
| # Set the input name of the graph |
| # For ONNX models, it is typically "0". |
| input_name = "data" |
| |
| # Set number of threads used for tuning based on the number of |
| # physical CPU cores on your machine. |
| num_threads = 1 |
| os.environ["TVM_NUM_THREADS"] = str(num_threads) |
| |
| |
| ################################################################# |
| # Configure tensor tuning settings and create tasks |
| # ------------------------------------------------- |
# To get better kernel execution performance on x86 CPUs,
# we need to change the data layout of convolution kernels from
# "NCHW" to "NCHWc". To deal with this, we define a
# conv2d_NCHWc operator in topi and tune it instead of
# the plain conv2d operator.
#
# We will use local mode for the tuning configuration. RPC tracker
# mode can be set up similarly to the approach in the
# :ref:`tune_relay_arm` tutorial.
| # |
| # To perform a precise measurement, we should repeat the measurement several |
| # times and use the average of results. In addition, we need to flush the cache |
| # for the weight tensors between repeated measurements. This can make the measured |
| # latency of one operator closer to its actual latency during end-to-end inference. |
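#
# As a quick illustration of the "NCHWc" layout (a numpy sketch, not part
# of the tuning flow): a 64-channel NCHW tensor split into blocks of 8
# channels becomes NCHW8c, with the channel block as the innermost
# dimension.
#
# .. code-block:: python
#
#     x_nchw = np.random.rand(1, 64, 56, 56).astype("float32")
#     # (N, C, H, W) -> (N, C//8, 8, H, W) -> (N, C//8, H, W, 8)
#     x_nchwc = x_nchw.reshape(1, 8, 8, 56, 56).transpose(0, 1, 3, 4, 2)
#     print(x_nchwc.shape)  # (1, 8, 56, 56, 8)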
| |
| tuning_option = { |
| "log_filename": log_file, |
| "tuner": "random", |
| "early_stopping": None, |
| "measure_option": autotvm.measure_option( |
| builder=autotvm.LocalBuilder(), |
| runner=autotvm.LocalRunner( |
| number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True |
| ), |
| ), |
| } |
| |
| |
| # You can skip the implementation of this function for this tutorial. |
| def tune_kernels( |
| tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log" |
| ): |
| |
| for i, task in enumerate(tasks): |
| prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) |
| |
| # create tuner |
| if tuner == "xgb" or tuner == "xgb-rank": |
| tuner_obj = XGBTuner(task, loss_type="rank") |
| elif tuner == "ga": |
| tuner_obj = GATuner(task, pop_size=50) |
| elif tuner == "random": |
| tuner_obj = RandomTuner(task) |
| elif tuner == "gridsearch": |
| tuner_obj = GridSearchTuner(task) |
| else: |
| raise ValueError("Invalid tuner: " + tuner) |
| |
| # do tuning |
| n_trial = len(task.config_space) |
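        # for very large search spaces you could cap this, e.g.
        # n_trial = min(n_trial, 2000), and rely on early_stopping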
| tuner_obj.tune( |
| n_trial=n_trial, |
| early_stopping=early_stopping, |
| measure_option=measure_option, |
| callbacks=[ |
| autotvm.callback.progress_bar(n_trial, prefix=prefix), |
| autotvm.callback.log_to_file(log_filename), |
| ], |
| ) |
| |
| |
# Use graph tuner to achieve graph-level optimal schedules.
| # Set use_DP=False if it takes too long to finish. |
| def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True): |
| target_op = [ |
| relay.op.get("nn.conv2d"), |
| ] |
| Tuner = DPTuner if use_DP else PBQPTuner |
| executor = Tuner(graph, {input_name: dshape}, records, target_op, target) |
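    # benchmark the layout transformation overheads the graph tuner needs
    # as input to its cost model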
| executor.benchmark_layout_transform(min_exec_num=2000) |
| executor.run() |
| executor.write_opt_sch2record_file(opt_sch_file) |
| |
| |
| ######################################################################## |
# Finally, we launch the tuning jobs and evaluate the end-to-end performance.
# The graph-level records written by tune_graph are applied with
# autotvm.apply_graph_best at compile time.
| |
| |
| def tune_and_evaluate(tuning_opt): |
| # extract workloads from relay program |
| print("Extract tasks...") |
| mod, params, data_shape, out_shape = get_network(model_name, batch_size) |
| tasks = autotvm.task.extract_from_program( |
| mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),) |
| ) |
| |
| # run tuning tasks |
| tune_kernels(tasks, **tuning_opt) |
| tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) |
| |
| # compile kernels with graph-level best records |
| with autotvm.apply_graph_best(graph_opt_sch_file): |
| print("Compile...") |
| with tvm.transform.PassContext(opt_level=3): |
| lib = relay.build_module.build(mod, target=target, params=params) |
| |
| # upload parameters to device |
| ctx = tvm.cpu() |
| data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) |
| module = runtime.GraphModule(lib["default"](ctx)) |
| module.set_input(input_name, data_tvm) |
| |
| # evaluate |
| print("Evaluate inference time cost...") |
| ftimer = module.module.time_evaluator("run", ctx, number=100, repeat=3) |
| prof_res = np.array(ftimer().results) * 1000 # convert to millisecond |
| print( |
| "Mean inference time (std dev): %.2f ms (%.2f ms)" |
| % (np.mean(prof_res), np.std(prof_res)) |
| ) |
| |
| |
# We do not run the tuning on our web server since it takes too long.
| # Uncomment the following line to run it by yourself. |
| |
| # tune_and_evaluate(tuning_option) |
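
# If you skip the graph tuning step, you can apply only the kernel-level
# records instead (a sketch):
#
#   with autotvm.apply_history_best(log_file):
#       with tvm.transform.PassContext(opt_level=3):
#           lib = relay.build_module.build(mod, target=target, params=params)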
| |
| ###################################################################### |
| # Sample Output |
| # ------------- |
# The tuning needs to compile many programs and extract features from them,
# so a high-performance CPU is recommended.
| # One sample output is listed below. |
| # |
| # .. code-block:: bash |
| # |
| # Extract tasks... |
| # Tuning... |
| # [Task 1/12] Current/Best: 598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done. |
| # [Task 2/12] Current/Best: 522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done. |
| # [Task 3/12] Current/Best: 447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done. |
| # [Task 4/12] Current/Best: 481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done. |
| # [Task 5/12] Current/Best: 414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done. |
| # [Task 6/12] Current/Best: 508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done. |
| # [Task 7/12] Current/Best: 469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done. |
| # [Task 8/12] Current/Best: 230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done. |
| # [Task 9/12] Current/Best: 487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done. |
| # [Task 10/12] Current/Best: 182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done. |
| # [Task 11/12] Current/Best: 372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done. |
| # [Task 12/12] Current/Best: 215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done. |
| # Compile... |
| # Evaluate inference time cost... |
| # Mean inference time (std dev): 3.16 ms (0.03 ms) |