| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
| Auto-tuning an ALU fused op on VTA |
| """ |
| |
| import copy |
| import os |
| from mxnet.gluon.model_zoo import vision |
| |
| from tvm import topi |
| import tvm |
| from tvm import te |
| from tvm import autotvm, relay |
| from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner |
| from tvm.autotvm import record |
| |
| import vta |
| from vta.testing import simulator |
| from vta.top import graph_pack |
| |
| |
| ################################################################# |
| # Compile network |
| # --------------- |
| # Perform VTA-specific compilation with Relay from a Gluon model |
| def compile_network(env, target, model, start_pack, stop_pack): |
| |
| # Populate the shape and data type dictionary |
| dtype_dict = {"data": "float32"} |
| shape_dict = {"data": (env.BATCH, 3, 224, 224)} |
| |
| # Grab an off-the-shelf Gluon model and convert it to Relay |
| gluon_model = vision.get_model(model, pretrained=True) |
| mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict) |
| |
| # Update shape and type dictionary |
| shape_dict.update({k: v.shape for k, v in params.items()}) |
| dtype_dict.update({k: str(v.dtype) for k, v in params.items()}) |
| |
| # Perform quantization in Relay |
| # Note: We set opt_level to 3 in order to fold batch norm |
| with relay.build_config(opt_level=3): |
| with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]): |
| mod = relay.quantize.quantize(mod, params=params) |
| |
| # Perform graph packing and constant folding for VTA target |
| if target.device_name == "vta": |
| assert env.BLOCK_IN == env.BLOCK_OUT |
| relay_prog = graph_pack( |
| mod["main"], |
| env.BATCH, |
| env.BLOCK_OUT, |
| env.WGT_WIDTH, |
| start_name=start_pack, |
| stop_name=stop_pack, |
| ) |
| else: |
| relay_prog = mod["main"] |
| |
| return relay_prog, params |
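| # ``graph_pack`` rewrites the operators between ``start_pack`` and |
| # ``stop_pack`` into VTA's packed (blocked) tensor layout; only this region |
| # of the graph is offloaded to the accelerator, while the rest stays on the |
| # CPU. |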
| |
| |
| ########################################### |
| # Set Tuning Options |
| # ------------------ |
| # Before tuning, we need to set a few configuration options. |
| # Here we use a Pynq-Z1 board as an example; note that the ALU tuning below |
| # only runs when the VTA target is ``intelfocl``. |
| |
| # The tracker host and port can be set through environment variables |
| tracker_host = os.environ.get("TVM_TRACKER_HOST", "0.0.0.0") |
| tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) |
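| # The tracker is assumed to be running already. For reference, it can be |
| # started on the host with the stock TVM helper, and the board registered |
| # to it with the VTA RPC server (shown as comments so this script stays |
| # runnable as-is): |
| # |
| #   python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 |
| #   python -m vta.exec.rpc_server --tracker=<tracker_host>:9190 |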
| |
| # Load VTA parameters from the vta/config/vta_config.json file |
| env = vta.get_env() |
| |
| # This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. |
| # Set ``device=arm_cpu`` to run inference on the CPU |
| # or ``device=vta`` to run inference on the FPGA. |
| device = "vta" |
| target = env.target if device == "vta" else env.target_vta_cpu |
| |
| # Name of the Gluon model to compile. |
| # The ``start_pack`` and ``stop_pack`` labels indicate where |
| # to start and end the graph packing Relay pass: in other words, |
| # where to start and finish offloading to VTA. |
| network = "resnet50_v2" |
| start_pack = "nn.max_pool2d" |
| stop_pack = "nn.global_avg_pool2d" |
| |
| # Tuning options |
| log_file = "%s.alu.%s.log" % (device, network) |
| tuning_option = { |
| "log_filename": log_file, |
| "tuner": "random", |
| "n_trial": 1000, |
| "early_stopping": None, |
| "measure_option": autotvm.measure_option( |
| builder=autotvm.LocalBuilder(n_parallel=1), |
| runner=autotvm.RPCRunner( |
| env.TARGET, |
| host=tracker_host, |
| port=tracker_port, |
| number=5, |
| timeout=60, |
| # check_correctness=True, # TODO: re-enable when check_correctness works again. |
| ), |
| ), |
| } |
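| # With this configuration, ``LocalBuilder`` cross-compiles each candidate |
| # schedule on the host, while ``RPCRunner`` requests a device registered |
| # under the key ``env.TARGET`` from the tracker and times every candidate |
| # ``number`` times. |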
| |
| |
| def log_to_file(file_out, protocol="json"): |
| """Log the tuning records into file. |
| The rows of the log are stored in the format of autotvm.record.encode. |
| for lhs == rhs, we add an extra rhs = [] record |
| |
| Parameters |
| ---------- |
| file_out : str |
| The file to log to. |
| protocol: str, optional |
| The log protocol. Can be 'json' or 'pickle' |
| |
| Returns |
| ------- |
| callback : callable |
| Callback function to do the logging. |
| """ |
| |
| def _callback(_, inputs, results): |
| with open(file_out, "a") as f: |
| for inp, result in zip(inputs, results): |
| f.write(record.encode(inp, result, protocol) + "\n") |
| |
| # For a task whose lhs and rhs are identical, also log an extra record with |
| # an empty rhs, so the degenerate form of the workload can match it later. |
| if inp.task.args[0] == inp.task.args[1]: |
| args = list(inp.task.args) |
| args[1] = (args[0][0], (), args[0][2]) |
| inp_copy = copy.deepcopy(inp) |
| inp_copy.task.args = tuple(args) |
| f.write(record.encode(inp_copy, result, protocol) + "\n") |
| |
| return _callback |
| |
| |
| def tune_tasks( |
| tasks, |
| measure_option, |
| tuner="xgb", |
| n_trial=10, |
| early_stopping=None, |
| log_filename="tuning.log", |
| use_transfer_learning=True, |
| ): |
| |
| # create tmp log file |
| tmp_log_file = log_filename + ".tmp" |
| if os.path.exists(tmp_log_file): |
| os.remove(tmp_log_file) |
| |
| for i, tsk in enumerate(reversed(tasks)): |
| prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) |
| |
| # create tuner |
| if tuner == "xgb" or tuner == "xgb-rank": |
| tuner_obj = XGBTuner(tsk, loss_type="rank") |
| elif tuner == "xgb_knob": |
| tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob") |
| elif tuner == "ga": |
| tuner_obj = GATuner(tsk, pop_size=50) |
| elif tuner == "random": |
| tuner_obj = RandomTuner(tsk) |
| elif tuner == "gridsearch": |
| tuner_obj = GridSearchTuner(tsk) |
| else: |
| raise ValueError("Invalid tuner: " + tuner) |
| |
| if use_transfer_learning: |
| if os.path.isfile(tmp_log_file): |
| tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) |
| |
| # do tuning |
| tsk_trial = min(n_trial, len(tsk.config_space)) |
| tuner_obj.tune( |
| n_trial=tsk_trial, |
| early_stopping=early_stopping, |
| measure_option=measure_option, |
| callbacks=[ |
| autotvm.callback.progress_bar(tsk_trial, prefix=prefix), |
| log_to_file(tmp_log_file), |
| ], |
| ) |
| |
| # pick the best records and write them to the target log file |
| autotvm.record.pick_best(tmp_log_file, log_filename) |
| os.remove(tmp_log_file) |
| |
| |
| ######################################################################## |
| # Register VTA-specific tuning tasks |
| def register_vta_tuning_tasks(): |
| from tvm.autotvm.task import TaskExtractEnv |
| |
| @tvm.te.tag_scope(tag=topi.tag.ELEMWISE) |
| def my_clip(x, a_min, a_max): |
| """Unlike topi's current clip, put min and max into two stages.""" |
| const_min = tvm.tir.const(a_min, x.dtype) |
| const_max = tvm.tir.const(a_max, x.dtype) |
| x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA") |
| x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB") |
| return x |
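| # Splitting clip into separate min and max stages lets each stage map onto |
| # a single VTA ALU vector instruction (the ALU implements MIN and MAX |
| # directly), keeping both stages visible to the tuner. |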
| |
| # Initialize the autotvm environment to register the VTA operators |
| TaskExtractEnv() |
| |
| @autotvm.template("add.vta") |
| def _topi_add(*args, **kwargs): |
| assert not kwargs, "kwargs are not supported in template function calls" |
| A, B = args[:2] |
| |
| with tvm.target.vta(): |
| res = vta.top.op.add_packed(*args, **kwargs) |
| res = my_clip(res, 0, 127) |
| res = topi.cast(res, "int8") |
| |
| if tvm.target.Target.current().device_name == "vta": |
| s = vta.top.op.schedule_add_packed([res]) |
| else: |
| s = te.create_schedule([res.op]) |
| return s, [A, B, res] |
| |
| @autotvm.template("multiply.vta") |
| def _topi_multiply(*args, **kwargs): |
| assert not kwargs, "kwargs are not supported in template function calls" |
| A, B = args[:2] |
| |
| with tvm.target.vta(): |
| res = vta.top.op.multiply_packed(*args, **kwargs) |
| res = my_clip(res, 0, 127) |
| res = topi.cast(res, "int8") |
| |
| if tvm.target.Target.current().device_name == "vta": |
| s = vta.top.op.schedule_multiply_packed([res]) |
| else: |
| s = te.create_schedule([res.op]) |
| return s, [A, B, res] |
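| # Both templates are registered under the task names used by VTA's TOPI |
| # integration ("add.vta", "multiply.vta"), so the tasks extracted below are |
| # measured together with the trailing clip-and-cast that follows these ops |
| # in the quantized network. |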
| |
| |
| ######################################################################## |
| # Finally, we launch the tuning jobs. Despite its name, ``tune_and_evaluate`` |
| # only performs tuning here; the sketch at the end of the function shows how |
| # the resulting log could later be applied at build time. |
| def tune_and_evaluate(tuning_opt): |
| |
| if env.TARGET != "intelfocl": |
| print("ALU-only op tuning is only available for the intelfocl target") |
| return |
| |
| # Register VTA tuning tasks |
| register_vta_tuning_tasks() |
| |
| # Perform task extraction on Relay program |
| print("Extract tasks...") |
| relay_prog, params = compile_network(env, target, network, start_pack, stop_pack) |
| mod = tvm.IRModule.from_expr(relay_prog) |
| tasks = autotvm.task.extract_from_program( |
| mod, |
| params=params, |
| ops=( |
| relay.op.get("add"), |
| relay.op.get("multiply"), |
| ), |
| target=target, |
| target_host=env.target_host, |
| ) |
| |
| # Filter out non-packed ALU tasks: packed VTA tensors carry more than |
| # four dimensions. |
| tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks)) |
| # Filter out floating-point ALU tasks, which cannot run on VTA. |
| tasks = list(filter(lambda t: t.args[0][2] != "float32", tasks)) |
| |
| # De-duplicate the extracted ALU tasks |
| tasks_set = {} |
| print("Extracted {} ALU tasks:".format(len(tasks))) |
| for tsk in tasks: |
| print("tsk = ", tsk) |
| |
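| # A task extracted with an empty rhs shape is the degenerate lhs == rhs |
| # form; restore the full rhs so it de-duplicates against the regular form |
| # (log_to_file later writes the matching empty-rhs record back out). |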
| if len(tsk.args[1][1]) == 0: |
| args = list(tsk.args) |
| args[1] = args[0] |
| tsk.args = tuple(args) |
| |
| if (tsk.name, tsk.args) in tasks_set: |
| print("task {} already exists".format(tsk)) |
| tasks_set[(tsk.name, tsk.args)] = tsk |
| |
| tasks = list(tasks_set.values()) |
| print("After merged, final #tasks={}, tasks = {}".format(len(tasks), tasks)) |
| |
| # run tuning tasks |
| print("Tuning...") |
| tune_tasks(tasks, **tuning_opt) |
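| # Evaluation is intentionally omitted here. As a minimal sketch (assuming a |
| # remote has been set up as in the other VTA tutorials), the tuned log could |
| # later be applied at build time like so: |
| # |
| #   with autotvm.apply_history_best(tuning_opt["log_filename"]): |
| #       with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): |
| #           lib = relay.build( |
| #               relay_prog, target=target, params=params, target_host=env.target_host |
| #           ) |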
| |
| |
| # Run the tuning jobs |
| tune_and_evaluate(tuning_option) |