tutorials/auto_scheduler/tune_matmul_x86.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 """
 Auto-scheduling matrix multiplication for CPU
 =============================================
 **Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
             `Chengfan Jia <https://github.com/jcf94/>`_

 Different from the existing :ref:`autotvm <tutorials-autotvm-sec>` which relies on
 manual templates to define the search space, the auto-scheduler does not require any templates.
 The auto-scheduler is template-free, so users only need to write the computation declaration without
 any schedule commands or templates.
 The auto-scheduler can automatically generate a large
 search space and find a good schedule in the space.

 We use matrix multiplication as an example in this tutorial.
 """

 import numpy as np
 import tvm
 from tvm import te, auto_scheduler

 ######################################################################
 # Define the computation
 # ^^^^^^^^^^^^^^^^^^^^^^
 # To begin with, let us define the computation of a matmul with bias add.
 # The function should return the list of input/output tensors.
 # From these tensors, the auto-scheduler can get the whole computational graph.


 @auto_scheduler.register_workload
 def matmul_add(N, L, M, dtype):
     """Declare a matrix multiplication followed by an element-wise bias add.

     The declaration is ``out[i, j] = sum_k(A[i, k] * B[k, j]) + C[i, j]``.
     Only the computation is described here; no schedule is attached.

     Parameters
     ----------
     N, L, M
         Extents of the operands: ``A`` is ``(N, L)``, ``B`` is ``(L, M)``,
         and both ``C`` and the output are ``(N, M)``.
     dtype
         Data type given to the three input placeholders.

     Returns
     -------
     list
         ``[A, B, C, out]``: the three input placeholders followed by the
         output tensor.
     """
     lhs = te.placeholder((N, L), name="A", dtype=dtype)
     rhs = te.placeholder((L, M), name="B", dtype=dtype)
     bias = te.placeholder((N, M), name="C", dtype=dtype)

     red = te.reduce_axis((0, L), name="k")

     def dot_row_col(i, j):
         # Reduce over the shared dimension of the two operands.
         return te.sum(lhs[i, red] * rhs[red, j], axis=red)

     product = te.compute((N, M), dot_row_col, name="matmul")

     def add_bias(i, j):
         return product[i, j] + bias[i, j]

     result = te.compute((N, M), add_bias, name="out")

     return [lhs, rhs, bias, result]


 ######################################################################
 # Create the search task
 # ^^^^^^^^^^^^^^^^^^^^^^
 # We then create a search task with N=L=M=128 and dtype="float32"
 # If your machine supports avx instructions, you can
 #
 #   - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
 #   - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512

 # "llvm" without an -mcpu flag, i.e. no AVX2 / AVX-512 variant is requested.
 target = tvm.target.Target("llvm")

 # Workload arguments follow matmul_add's parameter order: N, L, M, dtype.
 task = auto_scheduler.create_task(
     matmul_add,
     (128, 128, 128, "float32"),
     target,
 )

 # Inspect the computational graph
 print(task.compute_dag)

 ######################################################################
 # Next, we set parameters for the auto-scheduler.
 #
 # * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
 #   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
 #   good value for the search to converge. You can do more trials according to your time budget.
 # * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`.
 #   The measurement records can be used to query the history best, resume the search,
 #   and do more analyses later.
 # * see :any:`auto_scheduler.TuningOptions` for more parameters

 # 10 measurement trials; measurement records go to "matmul.json" via RecordToFile.
 tune_option = auto_scheduler.TuningOptions(
     num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")]
 )

 ######################################################################
 # Run the search
 # ^^^^^^^^^^^^^^
 # Now we get all inputs ready. Pretty simple, isn't it?
 # We can kick off the search and let the auto-scheduler do its magic.
 # After some measurement trials, it will return the best schedule it found.

 # `sch` is the best schedule found; `args` is passed with it to tvm.lower / tvm.build.
 sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)

 ######################################################################
 # We can lower the schedule to see the IR after auto-scheduling.
 # The auto-scheduler correctly performs optimizations including multi-level tiling,
 # parallelization, vectorization, unrolling and operator fusion.

 # Print the lowered IR of the auto-scheduled computation.
 print(tvm.lower(sch, args, simple_mode=True))

 ######################################################################
 # Check correctness and evaluate performance
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 # We build the binary and check its correctness and performance.

 func = tvm.build(sch, args)

 # Three random float32 operands, drawn in the order A, B, C, and the NumPy
 # reference result they should produce.
 a_np, b_np, c_np = (
     np.random.uniform(size=(128, 128)).astype(np.float32) for _ in range(3)
 )
 out_np = np.dot(a_np, b_np) + c_np

 # Copy the operands to the CPU device and allocate the output buffer there.
 ctx = tvm.cpu()
 a_tvm, b_tvm, c_tvm = (tvm.nd.array(host, ctx=ctx) for host in (a_np, b_np, c_np))
 out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
 func(a_tvm, b_tvm, c_tvm, out_tvm)

 # Check results
 np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

 # Evaluate execution time; the median is scaled by 1000 for the "ms" report.
 evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
 print(
     "Execution time of this operator: %.3f ms"
     % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
 )


 ######################################################################
 # Using the record file
 # ^^^^^^^^^^^^^^^^^^^^^
 # During the search, all measurement records are dumped into the record
 # file "matmul.json". The measurement records can be used to re-apply search results,
 # resume the search, and perform other analyses.

 ######################################################################
 # Here is an example where we load the best schedule from a file,
 # print the equivalent python schedule API, and build the binary again.

 # Load the measurement record for the best schedule
 inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)

 # Print equivalent python schedule API. This can be used for debugging and
 # learning the behavior of the auto-scheduler.
 print(
     "Equivalent python schedule:",
     task.compute_dag.print_python_code_from_state(inp.state),
     sep="\n",
 )

 # Rebuild the binary. This shows how you can apply the best schedule from a
 # log file without rerunning the search.
 sch, args = task.compute_dag.apply_steps_from_state(inp.state)
 func = tvm.build(sch, args)

 ######################################################################
 # A more complicated example is to resume the search.
 # In this case, we need to create the search policy and cost model by ourselves
 # and resume the status of search policy and cost model with the log file.
 # In the example below we resume the status and do 5 more trials.


 def resume_search(task, log_file):
     """Resume a previous search from a record file and run 5 more trials.

     A new ``XGBModel`` cost model is updated from ``log_file``, and a
     ``SketchPolicy`` is created with a ``PreloadMeasuredStates`` callback on
     the same file. The new measurements are recorded to ``log_file`` through
     a ``RecordToFile`` callback.

     Parameters
     ----------
     task
         The search task to continue tuning.
     log_file
         Path of the measurement record file that is both read and recorded to.

     Returns
     -------
     tuple
         The ``(sch, args)`` pair returned by ``auto_scheduler.auto_schedule``.
     """
     cost_model = auto_scheduler.XGBModel()
     cost_model.update_from_file(log_file)
     search_policy = auto_scheduler.SketchPolicy(
         task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
     )
     tune_option = auto_scheduler.TuningOptions(
         num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
     )
     # Return the search result instead of binding it to unused locals, so the
     # caller can lower or build the resumed schedule.
     return auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)


 # resume_search(task, "matmul.json")

 ######################################################################
 # .. note::
 #   We cannot run the line above because of the conflict between
 #   python's multiprocessing and tvm's thread pool.
 #   After running a tvm generated binary the python's multiprocessing library
 #   will hang forever. You have to make sure that you don't run any tvm
 #   generated binaries before calling auto-scheduler's search.
 #   To run the function above, you should comment out all code in
 #   "Check correctness and evaluate performance" section.
 #
 #   You should be careful about this problem in your applications.
 #   There are other workarounds for this problem.
 #   For example, you can start a new thread/process (with the builtin python library
 #   threading or multiprocessing) and run the tvm binaries in the new thread/process.
 #   This provides an isolation and avoids the conflict in the main thread/process.
 #   You can also use :any:`auto_scheduler.LocalRPCMeasureContext` for auto-scheduler,
 #   as shown in the GPU tutorial (:ref:`auto-scheduler-conv-gpu`).
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	"""
	Auto-scheduling matrix multiplication for CPU
	=============================================
	**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
	`Chengfan Jia <https://github.com/jcf94/>`_

	Different from the existing :ref:`autotvm <tutorials-autotvm-sec>` which relies on
	manual templates to define the search space, the auto-scheduler does not require any templates.
	The auto-scheduler is template-free, so users only need to write the computation declaration without
	any schedule commands or templates.
	The auto-scheduler can automatically generate a large
	search space and find a good schedule in the space.

	We use matrix multiplication as an example in this tutorial.
	"""

	import numpy as np
	import tvm
	from tvm import te, auto_scheduler

	######################################################################
	# Define the computation
	# ^^^^^^^^^^^^^^^^^^^^^^
	# To begin with, let us define the computation of a matmul with bias add.
	# The function should return the list of input/output tensors.
	# From these tensors, the auto-scheduler can get the whole computational graph.


	@auto_scheduler.register_workload
	def matmul_add(N, L, M, dtype):
	    """Declare a matmul with bias add: ``out = A x B + C``.

	    Parameters
	    ----------
	    N, L, M
	        Extents of the operands: ``A`` is ``(N, L)``, ``B`` is ``(L, M)``,
	        and both ``C`` and the output are ``(N, M)``.
	    dtype
	        Data type given to the three input placeholders.

	    Returns
	    -------
	    list
	        ``[A, B, C, out]``: the input placeholders followed by the output tensor.
	    """
	    A = te.placeholder((N, L), name="A", dtype=dtype)
	    B = te.placeholder((L, M), name="B", dtype=dtype)
	    C = te.placeholder((N, M), name="C", dtype=dtype)

	    # Reduction over the dimension shared by A and B.
	    k = te.reduce_axis((0, L), name="k")
	    matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul")
	    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")

	    return [A, B, C, out]


	######################################################################
	# Create the search task
	# ^^^^^^^^^^^^^^^^^^^^^^
	# We then create a search task with N=L=M=128 and dtype="float32"
	# If your machine supports avx instructions, you can
	#
	#   - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2
	#   - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512

	# "llvm" without an -mcpu flag, i.e. no AVX2 / AVX-512 variant is requested.
	target = tvm.target.Target("llvm")

	# Workload arguments follow matmul_add's parameter order: N, L, M, dtype.
	task = auto_scheduler.create_task(
	    matmul_add,
	    (128, 128, 128, "float32"),
	    target,
	)

	# Inspect the computational graph
	print(task.compute_dag)

	######################################################################
	# Next, we set parameters for the auto-scheduler.
	#
	# * :code:`num_measure_trials` is the number of measurement trials we can use during the search.
	#   We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a
	#   good value for the search to converge. You can do more trials according to your time budget.
	# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`.
	#   The measurement records can be used to query the history best, resume the search,
	#   and do more analyses later.
	# * see :any:`auto_scheduler.TuningOptions` for more parameters

	# 10 measurement trials; measurement records go to "matmul.json" via RecordToFile.
	tune_option = auto_scheduler.TuningOptions(
	num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")]
	)

	######################################################################
	# Run the search
	# ^^^^^^^^^^^^^^
	# Now we get all inputs ready. Pretty simple, isn't it?
	# We can kick off the search and let the auto-scheduler do its magic.
	# After some measurement trials, it will return the best schedule it found.

	# `sch` is the best schedule found; `args` is passed with it to tvm.lower / tvm.build.
	sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option)

	######################################################################
	# We can lower the schedule to see the IR after auto-scheduling.
	# The auto-scheduler correctly performs optimizations including multi-level tiling,
	# parallelization, vectorization, unrolling and operator fusion.

	# Print the lowered IR of the auto-scheduled computation.
	print(tvm.lower(sch, args, simple_mode=True))

	######################################################################
	# Check correctness and evaluate performance
	# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
	# We build the binary and check its correctness and performance.

	func = tvm.build(sch, args)

	# Three random float32 operands, drawn in the order A, B, C, and the NumPy
	# reference result they should produce.
	a_np, b_np, c_np = (
	    np.random.uniform(size=(128, 128)).astype(np.float32) for _ in range(3)
	)
	out_np = np.dot(a_np, b_np) + c_np

	# Copy the operands to the CPU device and allocate the output buffer there.
	ctx = tvm.cpu()
	a_tvm, b_tvm, c_tvm = (tvm.nd.array(host, ctx=ctx) for host in (a_np, b_np, c_np))
	out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
	func(a_tvm, b_tvm, c_tvm, out_tvm)

	# Check results
	np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

	# Evaluate execution time; the median is scaled by 1000 for the "ms" report.
	evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
	print(
	    "Execution time of this operator: %.3f ms"
	    % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000)
	)


	######################################################################
	# Using the record file
	# ^^^^^^^^^^^^^^^^^^^^^
	# During the search, all measurement records are dumped into the record
	# file "matmul.json". The measurement records can be used to re-apply search results,
	# resume the search, and perform other analyses.

	######################################################################
	# Here is an example where we load the best schedule from a file,
	# print the equivalent python schedule API, and build the binary again.

	# Load the measurement record for the best schedule
	inp, res = auto_scheduler.load_best("matmul.json", task.workload_key)

	# Print equivalent python schedule API. This can be used for debugging and
	# learning the behavior of the auto-scheduler.
	print(
	    "Equivalent python schedule:",
	    task.compute_dag.print_python_code_from_state(inp.state),
	    sep="\n",
	)

	# Rebuild the binary. This shows how you can apply the best schedule from a
	# log file without rerunning the search.
	sch, args = task.compute_dag.apply_steps_from_state(inp.state)
	func = tvm.build(sch, args)

	######################################################################
	# A more complicated example is to resume the search.
	# In this case, we need to create the search policy and cost model by ourselves
	# and resume the status of search policy and cost model with the log file.
	# In the example below we resume the status and do 5 more trials.


	def resume_search(task, log_file):
	    """Resume a previous search from a record file and run 5 more trials.

	    A new ``XGBModel`` cost model is updated from ``log_file``, and a
	    ``SketchPolicy`` is created with a ``PreloadMeasuredStates`` callback on
	    the same file. The new measurements are recorded to ``log_file`` through
	    a ``RecordToFile`` callback.

	    Parameters
	    ----------
	    task
	        The search task to continue tuning.
	    log_file
	        Path of the measurement record file that is both read and recorded to.

	    Returns
	    -------
	    tuple
	        The ``(sch, args)`` pair returned by ``auto_scheduler.auto_schedule``.
	    """
	    cost_model = auto_scheduler.XGBModel()
	    cost_model.update_from_file(log_file)
	    search_policy = auto_scheduler.SketchPolicy(
	        task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)]
	    )
	    tune_option = auto_scheduler.TuningOptions(
	        num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)]
	    )
	    # Return the search result instead of binding it to unused locals, so the
	    # caller can lower or build the resumed schedule.
	    return auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option)


	# resume_search(task, "matmul.json")

	######################################################################
	# .. note::
	#   We cannot run the line above because of the conflict between
	#   python's multiprocessing and tvm's thread pool.
	#   After running a tvm generated binary the python's multiprocessing library
	#   will hang forever. You have to make sure that you don't run any tvm
	#   generated binaries before calling auto-scheduler's search.
	#   To run the function above, you should comment out all code in
	#   "Check correctness and evaluate performance" section.
	#
	#   You should be careful about this problem in your applications.
	#   There are other workarounds for this problem.
	#   For example, you can start a new thread/process (with the builtin python library
	#   threading or multiprocessing) and run the tvm binaries in the new thread/process.
	#   This provides an isolation and avoids the conflict in the main thread/process.
	#   You can also use :any:`auto_scheduler.LocalRPCMeasureContext` for auto-scheduler,
	#   as shown in the GPU tutorial (:ref:`auto-scheduler-conv-gpu`).