| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| """ |
Writing a Tunable Template and Using the Auto-tuner
===================================================
| **Author**: `Lianmin Zheng <https://github.com/merrymercy>`_ |
| |
This is an introductory tutorial to the auto-tuning module in TVM.
| |
There are two steps in auto-tuning.
The first step is defining a search space.
The second step is running a search algorithm to explore this space.
In this tutorial, you will learn how to perform these two steps in TVM.
| The whole workflow is illustrated by a matrix multiplication example. |
| """ |
| |
| ###################################################################### |
| # Install dependencies |
| # -------------------- |
# To use the autotvm package in TVM, we need to install some extra dependencies.
# (Installing xgboost is only required if you plan to use the :any:`XGBTuner`;
# change "3" to "2" if you use python2):
| # |
| # .. code-block:: bash |
| # |
| # pip3 install --user psutil xgboost |
| # |
# To make TVM run faster during tuning, it is recommended to use Cython
# as the FFI of TVM. In the root directory of TVM, execute
# (change "3" to "2" if you use python2):
| # |
| # .. code-block:: bash |
| # |
| # pip3 install --user cython |
| # sudo make cython3 |
| # |
# Now return to the Python code and import the packages.
| |
| import logging |
| import sys |
| |
| import numpy as np |
| import tvm |
| from tvm import te |
| |
| # the module is called `autotvm` |
| from tvm import autotvm |
| |
| ###################################################################### |
| # Step 1: Define the search space |
| # -------------------------------- |
# In this section, we will rewrite deterministic TVM schedule code into a
# tunable schedule template. You can regard the process of search space
# definition as the parameterization of our existing schedule code.
| # |
| # To begin with, here is how we implement a blocked matrix multiplication in TVM. |
| |
| # Matmul V0: Constant tiling factor |
| def matmul_v0(N, L, M, dtype): |
| A = te.placeholder((N, L), name="A", dtype=dtype) |
| B = te.placeholder((L, M), name="B", dtype=dtype) |
| |
| k = te.reduce_axis((0, L), name="k") |
| C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") |
| s = te.create_schedule(C.op) |
| |
| # schedule |
| y, x = s[C].op.axis |
| k = s[C].op.reduce_axis[0] |
| |
| yo, yi = s[C].split(y, 8) |
| xo, xi = s[C].split(x, 8) |
| |
| s[C].reorder(yo, xo, k, yi, xi) |
| |
| return s, [A, B, C] |
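

######################################################################
# As a quick sanity check, we can build and run this deterministic
# schedule directly. This is a minimal sketch that follows the same
# build/run pattern used at the end of this tutorial; the small 64x64
# shapes here are chosen only to keep it fast.

s, arg_bufs = matmul_v0(64, 64, 64, "float32")
func = tvm.build(s, arg_bufs)
a = tvm.nd.array(np.random.uniform(size=(64, 64)).astype(np.float32))
b = tvm.nd.array(np.random.uniform(size=(64, 64)).astype(np.float32))
c = tvm.nd.empty((64, 64))
func(a, b, c)
# compare against numpy's matmul as a reference
np.testing.assert_allclose(c.asnumpy(), a.asnumpy().dot(b.asnumpy()), rtol=1e-3)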
| |
| |
| ##################################################################### |
| # Parametrize the schedule |
| # ^^^^^^^^^^^^^^^^^^^^^^^^ |
# In the previous schedule code, we use a constant "8" as the tiling factor.
# However, it might not be the best choice, because the best tiling factor
# depends on the actual hardware environment and the input shape.
| # |
| # If you want the schedule code to be portable across a wider range of input shapes |
| # and target hardware, it is better to define a set of candidate values and |
| # pick the best one according to the measurement results on target hardware. |
| # |
# In autotvm, we can define a tunable parameter, or a "knob", for such a value.
| |
| # Matmul V1: List candidate values |
| @autotvm.template("tutorial/matmul_v1") # 1. use a decorator |
| def matmul_v1(N, L, M, dtype): |
| A = te.placeholder((N, L), name="A", dtype=dtype) |
| B = te.placeholder((L, M), name="B", dtype=dtype) |
| |
| k = te.reduce_axis((0, L), name="k") |
| C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") |
| s = te.create_schedule(C.op) |
| |
| # schedule |
| y, x = s[C].op.axis |
| k = s[C].op.reduce_axis[0] |
| |
| # 2. get the config object |
| cfg = autotvm.get_config() |
| |
| # 3. define search space |
| cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) |
| cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) |
| |
| # 4. schedule according to config |
| yo, yi = s[C].split(y, cfg["tile_y"].val) |
| xo, xi = s[C].split(x, cfg["tile_x"].val) |
| |
| s[C].reorder(yo, xo, k, yi, xi) |
| |
| return s, [A, B, C] |
| |
| |
| ############################################################################### |
| # Here we make four modifications to the previous schedule code and get |
| # a tunable "template". We can explain the modifications one by one. |
| # |
| # 1. Use a decorator to mark this function as a simple template. |
| # 2. Get a config object: |
#    You can regard this :code:`cfg` as an argument of this function, except
#    that we obtain it in a different way. With this argument, this function is
#    no longer deterministic schedule code. Instead, we can pass different
#    configurations to this function and get different schedules, so this
#    function is a "template".
| # |
#    To make the template function more compact, we do two things in a single
#    function: (1) define a search space and (2) schedule according to an
#    entity in this space. To achieve this, we allow :code:`cfg` to be either
#    a :any:`ConfigSpace` or a :any:`ConfigEntity` object.
| # |
#    When it is a :any:`ConfigSpace`, it collects all tunable knobs in this
#    function and builds the search space.
#    When it is a :any:`ConfigEntity`, it ignores all space definition APIs
#    (namely, :code:`cfg.define_XXXXX(...)`). Instead, it stores deterministic
#    values for all tunable knobs, and we schedule according to these values.
| # |
#    During auto-tuning, we will first call this template with a
#    :any:`ConfigSpace` object to build the search space. Then we call this
#    template with different :any:`ConfigEntity` objects in the built space to
#    get different schedules. Finally, we will measure the code generated by
#    the different schedules and pick the best one (a short sketch of this
#    flow follows this list).
| # |
# 3. Define two tunable knobs. The first one is :code:`tile_y` with
#    5 possible values. The second one is :code:`tile_x` with the same
#    list of possible values. These two knobs are independent, so they
#    span a search space of size 5x5 = 25.
# 4. Schedule according to the deterministic values in :code:`cfg`.
| # |
| |
| ##################################################################### |
| # Use better space definition API |
| # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
# In the previous template, we manually listed all possible values for a knob.
# This is the lowest-level API for defining the space.
# However, we also provide another set of APIs that make the space definition
# easier and smarter. It is recommended to use these high-level APIs.
| # |
| # In the following example, we use :any:`ConfigSpace.define_split` to define a split |
| # knob. It will enumerate all the possible ways to split an axis and construct |
| # the space. |
| # |
# We also have :any:`ConfigSpace.define_reorder` for reorder knobs and
# :any:`ConfigSpace.define_annotate` for annotations like unrolling,
# vectorization, and thread binding.
# When the high-level API cannot meet your requirements, you can always fall
# back to the low-level API.
| |
| |
| @autotvm.template("tutorial/matmul") |
| def matmul(N, L, M, dtype): |
| A = te.placeholder((N, L), name="A", dtype=dtype) |
| B = te.placeholder((L, M), name="B", dtype=dtype) |
| |
| k = te.reduce_axis((0, L), name="k") |
| C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") |
| s = te.create_schedule(C.op) |
| |
| # schedule |
| y, x = s[C].op.axis |
| k = s[C].op.reduce_axis[0] |
| |
| ##### define space begin ##### |
| cfg = autotvm.get_config() |
| cfg.define_split("tile_y", y, num_outputs=2) |
| cfg.define_split("tile_x", x, num_outputs=2) |
| ##### define space end ##### |
| |
| # schedule according to config |
| yo, yi = cfg["tile_y"].apply(s, C, y) |
| xo, xi = cfg["tile_x"].apply(s, C, x) |
| |
| s[C].reorder(yo, xo, k, yi, xi) |
| |
| return s, [A, B, C] |
| |
| |
| ###################################################################### |
# .. note:: More Explanation on :code:`cfg.define_split`
| # |
#   In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will
#   enumerate all possible combinations that split axis y into two axes using
#   factors of the length of y.
#   For example, if the length of y is 32 and we want to split it into two axes
#   using factors of 32, then there are 6 possible values for the
#   (length of outer axis, length of inner axis) pair, namely
#   (32, 1), (16, 2), (8, 4), (4, 8), (2, 16), or (1, 32).
#   These are exactly the 6 possible values of :code:`tile_y`.
| # |
#   During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object.
#   The lengths of the outer and inner axes are stored in
#   :code:`cfg["tile_y"].size` (a tuple with two elements).
#   In this template, we apply it with :code:`yo, yi = cfg["tile_y"].apply(s, C, y)`.
#   Actually, this is equivalent to
#   :code:`yo, yi = s[C].split(y, cfg["tile_y"].size[1])`
#   or :code:`yo, yi = s[C].split(y, nparts=cfg["tile_y"].size[0])`.
| # |
#   The advantage of using the :code:`cfg.apply` API is that it makes
#   multi-level splits (when :code:`num_outputs` >= 3) easier, as the
#   sketch below shows.
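#
#   For example, a hypothetical three-level split (not used in this
#   tutorial's template) would be written as:
#
#   .. code-block:: python
#
#       cfg.define_split("tile_y", y, num_outputs=3)
#       yo, ym, yi = cfg["tile_y"].apply(s, C, y)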
| |
| ###################################################################### |
| # Step 2: Search through the space |
| # --------------------------------- |
# In step 1, we built the search space by extending our old schedule code
# into a template. The next step is to pick a tuner and explore this space.
| # |
| # Auto-tuners in TVM |
| # ^^^^^^^^^^^^^^^^^^ |
# The job of a tuner can be described by the following pseudo code
#
# .. code-block:: text
| # |
| # ct = 0 |
| # while ct < max_number_of_trials: |
| # propose a batch of configs |
| # measure this batch of configs on real hardware and get results |
| # ct += batch_size |
| # |
| # When proposing the next batch of configs, the tuner can take different strategies. We |
| # provide four tuners with different strategies in autotvm. |
| # |
# * :any:`RandomTuner`: Enumerates the space in a random order
# * :any:`GridSearchTuner`: Enumerates the space in a grid search order
# * :any:`GATuner`: Uses a genetic algorithm to search through the space
# * :any:`XGBTuner`: Uses a model-based method; it trains an XGBoost model to
#   predict the speed of the lowered IR and picks the next batch according to
#   the prediction
| # |
# You can choose the tuner according to the size of your space, your time
# budget and other factors. For example, if your space is very small (less
# than 1000 configs), a grid search tuner or a random tuner is good enough.
# If your space is at the level of 10^9 (this is the space size of a conv2d
# operator on a CUDA GPU), the :any:`XGBTuner` can explore it more efficiently
# and find better configs.
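#
# For example, switching to the :any:`XGBTuner` changes only the tuner
# construction; this is a sketch, and the :code:`tune` interface used later
# in this tutorial stays the same:
#
# .. code-block:: python
#
#   tuner = autotvm.tuner.XGBTuner(task)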
| |
| ################################################################ |
| # Begin tuning |
| # ^^^^^^^^^^^^ |
| # Here we continue our matrix multiplication example. |
# First we create a tuning task. We can also inspect the initialized
# search space. In this case, for a 512x512 square matrix multiplication,
# the space size is 10x10 = 100.
| N, L, M = 512, 512, 512 |
| task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") |
| print(task.config_space) |
| |
| ################################################################ |
# Then we need to define how to measure the generated code and pick a tuner.
# Since our space is small, a random tuner is good enough.
| # |
| # We only make 10 trials in this tutorial for demonstration. In practice, |
| # you can do more trials according to your time budget. |
| # We will log the tuning results into a log file. This file can be |
| # used to get the best config later. |
| |
| # logging config (for printing tuning log to the screen) |
| logging.getLogger("autotvm").setLevel(logging.DEBUG) |
| logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) |
| |
# There are two steps for measuring a config: build and run.
# By default, we use all CPU cores to compile the programs in parallel,
# then measure them sequentially. We measure each config 5 times and
# take the average to reduce variance.
| measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) |
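
# If you tune for a remote device, you could instead build locally and run
# through an RPC tracker. This is a sketch only; "device_key", the host, and
# the port below are placeholders for your own tracker setup:
#
#   measure_option = autotvm.measure_option(
#       builder=autotvm.LocalBuilder(),
#       runner=autotvm.RPCRunner("device_key", host="127.0.0.1", port=9190, number=5),
#   )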
| |
# Begin tuning with RandomTuner; log records to file `matmul.log`.
# You can use alternatives like the XGBTuner sketched above.
| tuner = autotvm.tuner.RandomTuner(task) |
| tuner.tune( |
| n_trial=10, |
| measure_option=measure_option, |
| callbacks=[autotvm.callback.log_to_file("matmul.log")], |
| ) |
| |
| ######################################################################### |
# Finally we apply the history best from the log file and check its correctness.
# We can call the function :code:`matmul` directly under the
# :any:`autotvm.apply_history_best` context. When we call this function,
# it will query the dispatch context with its arguments and get the best
# config recorded for the same arguments.
| |
| # apply history best from log file |
| with autotvm.apply_history_best("matmul.log"): |
| with tvm.target.Target("llvm"): |
| s, arg_bufs = matmul(N, L, M, "float32") |
| func = tvm.build(s, arg_bufs) |
| |
| # check correctness |
| a_np = np.random.uniform(size=(N, L)).astype(np.float32) |
| b_np = np.random.uniform(size=(L, M)).astype(np.float32) |
| c_np = a_np.dot(b_np) |
| |
| c_tvm = tvm.nd.empty(c_np.shape) |
| func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) |
| |
| tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) |
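
#########################################################################
# As an optional final step, we can time the tuned kernel with TVM's
# :code:`time_evaluator` (a minimal sketch; absolute numbers will vary
# across machines):

# run the kernel 10 times and report the mean latency in milliseconds
evaluator = func.time_evaluator(func.entry_name, tvm.cpu(0), number=10)
print("tuned matmul: %f ms" % (evaluator(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm).mean * 1e3))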