tests/python/tir-transform/test_tir_transform_profiling_instr.py - tvm - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import tvm
 import tvm.testing
 from tvm import te
 from tvm.ir.module import IRModule
 from tvm.script import tir as T
 import numpy

 default_lwp_test_config = {
     "tir.instrument_lwp": True,
     "tir.lwp_disable_func_prof": True,
     "tir.reset_start_id": True,
 }


 @T.prim_func
 def input1(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     for i, j in T.grid(8, 8):
         for k, l in T.grid(8, 16):
             with T.block("B"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
         for k, l in T.grid(8, 16):
             with T.block("C"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2


 @T.prim_func
 def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
         for j in T.serial(0, 8):
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
         for j in T.serial(0, 8):
             for k, l in T.grid(8, 16):
                 with T.block("C"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]


 @T.prim_func
 def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
         for j in T.parallel(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
         for j in T.serial(0, 8):
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]


 @T.prim_func
 def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     for i, j in T.grid(8, 8):
         T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
         for k, l in T.grid(8, 16):
             with T.block("B"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
         T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
         T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
         for k, l in T.grid(8, 16):
             with T.block("C"):
                 vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                 C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
         T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))


 @T.prim_func
 def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
         T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
         T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
     T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


 @T.prim_func
 def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
         T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
             T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
             T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
         T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
     T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


 @T.prim_func
 def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
         T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
             T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
             T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
         T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
         T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
         for j in T.serial(0, 8):
             T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("C"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
             T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
             T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
             for k, l in T.grid(8, 16):
                 with T.block("B"):
                     vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                     C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
             T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
         T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))


 @T.prim_func
 def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
     for i in T.serial(0, 8):
         T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.serial(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
         T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
     T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


 @T.prim_func
 def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
     A = T.match_buffer(a, (8, 8, 128), dtype="int32")
     B = T.match_buffer(b, (8, 8, 128), dtype="int32")
     C = T.match_buffer(c, (8, 8, 128), dtype="int32")
     D = T.match_buffer(d, (8, 8, 128), dtype="int32")
     for i in T.serial(0, 8):
         T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
         for j in T.parallel(0, 8):
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
             for k in T.serial(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
         T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
         T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
         for j in T.serial(0, 8):
             T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("C"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
             T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
             T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
             for k in T.parallel(0, 8):
                 for l in T.serial(0, 16):
                     with T.block("B"):
                         vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
                         C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
             T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
         T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))


 # By default, only loops with siblings are instrumented.
 def test1():
     with tvm.transform.PassContext(config=default_lwp_test_config):
         mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test1_expected_output.with_attr("global_symbol", "main")
     )


 # By default, only loops with siblings are instrumented. Here, 'lwp_max_depth'
 # doesn't have any effect unless 'instr_siblings' is set to False (ex: test3).
 def test2():
     test2_config = default_lwp_test_config.copy()
     test2_config.update({"tir.lwp_max_depth": 3})
     with tvm.transform.PassContext(config=test2_config):
         mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test1_expected_output.with_attr("global_symbol", "main")
     )


 # test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag
 # is effective only when 'instr_siblings' is disabled. Also, note that inner-most
 # loops are always excluded from instrumentation unless overwritten using
 # 'lwp_min_height' (ex: test5)
 def test3():
     test3_config = default_lwp_test_config.copy()
     test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False})
     with tvm.transform.PassContext(config=test3_config):
         mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test3_expected_output.with_attr("global_symbol", "main")
     )


 # test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from
 # instrumentation.
 def test4():
     with tvm.transform.PassContext(config=default_lwp_test_config):
         mod = tvm.IRModule.from_expr(input2.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test4_expected_output.with_attr("global_symbol", "main")
     )


 # test5: Use both 'lwp_min_height' and 'lwp_max_depth'.
 # instrumentation.
 def test5():
     test5_config = default_lwp_test_config.copy()
     test5_config.update(
         {"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2}
     )
     with tvm.transform.PassContext(config=test5_config):
         mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test5_expected_output.with_attr("global_symbol", "main")
     )


 # test6: Tests instrumentation for the parallel loops
 def test6():
     with tvm.transform.PassContext(config=default_lwp_test_config):
         mod = tvm.IRModule.from_expr(input3.with_attr("global_symbol", "main"))
         mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
     tvm.ir.assert_structural_equal(
         mod["main"], test6_expected_output.with_attr("global_symbol", "main")
     )


 if __name__ == "__main__":
     tvm.testing.main()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import tvm
	import tvm.testing
	from tvm import te
	from tvm.ir.module import IRModule
	from tvm.script import tir as T
	import numpy

	default_lwp_test_config = {
	"tir.instrument_lwp": True,
	"tir.lwp_disable_func_prof": True,
	"tir.reset_start_id": True,
	}


	@T.prim_func
	def input1(a: T.handle, b: T.handle, c: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	for i, j in T.grid(8, 8):
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k, l in T.grid(8, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2


	@T.prim_func
	def input2(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	D = T.match_buffer(d, (8, 8, 128), dtype="int32")
	for i in T.serial(0, 8):
	for j in T.serial(0, 8):
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	for j in T.serial(0, 8):
	for k, l in T.grid(8, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]


	@T.prim_func
	def input3(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	D = T.match_buffer(d, (8, 8, 128), dtype="int32")
	for i in T.serial(0, 8):
	for j in T.parallel(0, 8):
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	for j in T.serial(0, 8):
	for k in T.parallel(0, 8):
	for l in T.serial(0, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
	for k in T.parallel(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]


	@T.prim_func
	def test1_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	for i, j in T.grid(8, 8):
	T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))


	@T.prim_func
	def test2_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
	for i in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
	for j in T.serial(0, 8):
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


	@T.prim_func
	def test3_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
	for i in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
	for j in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


	@T.prim_func
	def test4_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	D = T.match_buffer(d, (8, 8, 128), dtype="int32")
	for i in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
	for j in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(3, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(3, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(5, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	T.evaluate(T.end_profile_intrinsic(5, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
	for j in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
	T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
	for k, l in T.grid(8, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))


	@T.prim_func
	def test5_expected_output(a: T.handle, b: T.handle, c: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	T.evaluate(T.start_profile_intrinsic(1, dtype="handle"))
	for i in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
	for j in T.serial(0, 8):
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * 2
	T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(1, dtype="handle"))


	@T.prim_func
	def test6_expected_output(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
	A = T.match_buffer(a, (8, 8, 128), dtype="int32")
	B = T.match_buffer(b, (8, 8, 128), dtype="int32")
	C = T.match_buffer(c, (8, 8, 128), dtype="int32")
	D = T.match_buffer(d, (8, 8, 128), dtype="int32")
	for i in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(2, dtype="handle"))
	for j in T.parallel(0, 8):
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = A[vi, vj, vk * 16 + vl] * 2
	for k in T.serial(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	B[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	T.evaluate(T.end_profile_intrinsic(2, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(7, dtype="handle"))
	for j in T.serial(0, 8):
	T.evaluate(T.start_profile_intrinsic(8, dtype="handle"))
	for k in T.parallel(0, 8):
	for l in T.serial(0, 16):
	with T.block("C"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = B[vi, vj, vk * 16 + vl] + 2
	T.evaluate(T.end_profile_intrinsic(8, dtype="handle"))
	T.evaluate(T.start_profile_intrinsic(10, dtype="handle"))
	for k in T.parallel(0, 8):
	for l in T.serial(0, 16):
	with T.block("B"):
	vi, vj, vk, vl = T.axis.remap("SSSS", [i, j, k, l])
	C[vi, vj, vk * 16 + vl] = C[vi, vj, vk * 16 + vl] * D[vi, vj, vk * 16 + vl]
	T.evaluate(T.end_profile_intrinsic(10, dtype="handle"))
	T.evaluate(T.end_profile_intrinsic(7, dtype="handle"))


	# By default, only loops with siblings are instrumented.
	def test1():
	with tvm.transform.PassContext(config=default_lwp_test_config):
	mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test1_expected_output.with_attr("global_symbol", "main")
	)


	# By default, only loops with siblings are instrumented. Here, 'lwp_max_depth'
	# doesn't have any effect unless 'instr_siblings' is set to False (ex: test3).
	def test2():
	test2_config = default_lwp_test_config.copy()
	test2_config.update({"tir.lwp_max_depth": 3})
	with tvm.transform.PassContext(config=test2_config):
	mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test1_expected_output.with_attr("global_symbol", "main")
	)


	# test3: Use 'lwp_max_depth' to instrument loops upto a certain depth. This flag
	# is effective only when 'instr_siblings' is disabled. Also, note that inner-most
	# loops are always excluded from instrumentation unless overwritten using
	# 'lwp_min_height' (ex: test5)
	def test3():
	test3_config = default_lwp_test_config.copy()
	test3_config.update({"tir.lwp_max_depth": 3, "tir.instr_siblings": False})
	with tvm.transform.PassContext(config=test3_config):
	mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test3_expected_output.with_attr("global_symbol", "main")
	)


	# test4: Use 'lwp_min_height' to exclude inner loops upto a certain height from
	# instrumentation.
	def test4():
	with tvm.transform.PassContext(config=default_lwp_test_config):
	mod = tvm.IRModule.from_expr(input2.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test4_expected_output.with_attr("global_symbol", "main")
	)


	# test5: Use both 'lwp_min_height' and 'lwp_max_depth'.
	# instrumentation.
	def test5():
	test5_config = default_lwp_test_config.copy()
	test5_config.update(
	{"tir.lwp_max_depth": 3, "tir.instr_siblings": False, "tir.lwp_min_height": 2}
	)
	with tvm.transform.PassContext(config=test5_config):
	mod = tvm.IRModule.from_expr(input1.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test5_expected_output.with_attr("global_symbol", "main")
	)


	# test6: Tests instrumentation for the parallel loops
	def test6():
	with tvm.transform.PassContext(config=default_lwp_test_config):
	mod = tvm.IRModule.from_expr(input3.with_attr("global_symbol", "main"))
	mod = tvm.tir.transform.InstrumentProfileIntrinsics()(mod)
	tvm.ir.assert_structural_equal(
	mod["main"], test6_expected_output.with_attr("global_symbol", "main")
	)


	if __name__ == "__main__":
	tvm.testing.main()