# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
from tvm import te
import numpy as np
import tvm.testing
from tvm.topi.math import cast
def test_for():
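    """Nested for_range loops should lower to Allocate -> For -> SeqStmt with an inner For."""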
ib = tvm.tir.ir_builder.create()
n = te.size_var("n")
A = ib.allocate("float32", n, name="A", scope="global")
with ib.for_range(0, n, name="i") as i:
A[i] = A[i] + 1
with ib.for_range(0, 10, name="j") as j:
A[j] = A[j] + 2
body = ib.get()
assert isinstance(body, tvm.tir.Allocate)
body = body.body
assert isinstance(body, tvm.tir.For)
body = body.body
assert isinstance(body, tvm.tir.SeqStmt)
assert isinstance(body[1], tvm.tir.For)
def test_if():
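    """if_scope/else_scope inside a loop should lower to an IfThenElse with an EQ condition."""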
ib = tvm.tir.ir_builder.create()
n = te.size_var("n")
A = ib.pointer("float32", name="A")
tmod = tvm.tir.truncmod
with ib.for_range(0, n, name="i") as i:
with ib.if_scope(tmod(i, 2) == 0):
A[i] = A[i] + 1
with ib.else_scope():
A[0] = A[i] + 2
body = ib.get()
assert A == A
assert isinstance(body, tvm.tir.For)
body = body.body
assert isinstance(body, tvm.tir.IfThenElse)
assert isinstance(body.condition, tvm.tir.EQ)
assert isinstance(body.then_case.indices[0], tvm.tir.Var)
assert list(body.else_case.indices) == [0]
def test_prefetch():
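    """A tir.Prefetch emitted inside a loop should carry the requested bounds."""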
A = tvm.tir.decl_buffer((10, 20), name="A")
ib = tvm.tir.ir_builder.create()
n = te.size_var("n")
with ib.for_range(0, n, name="i") as i:
ib.emit(
tvm.tir.Prefetch(
A, [tvm.ir.Range.from_min_extent(i + 1, 2), tvm.ir.Range.from_min_extent(0, 20)]
)
)
body = ib.get()
assert body.body.bounds[0].extent.value == 2
def test_cpu():
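    """Build a vector-add kernel from hand-written TIR via te.extern and check it on LLVM."""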
n = 1024
dtype = "float32"
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
def test_device_ir(A, B, C):
n = A.shape[0]
max_threads = 8
ib = tvm.tir.ir_builder.create()
Aptr = ib.buffer_ptr(A)
Bptr = ib.buffer_ptr(B)
Cptr = ib.buffer_ptr(C)
with ib.for_range(0, n, name="i") as i:
Cptr[i] = Aptr[i] + Bptr[i]
body = ib.get()
return body
C = te.extern(
A.shape,
[A, B],
lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
name="vector_add",
dtype=dtype,
)
s = te.create_schedule(C.op)
def check_target(target):
if not tvm.testing.device_enabled(target):
return
# build and invoke the kernel.
fadd = tvm.build(s, [A, B, C], target)
dev = tvm.device(target, 0)
# launch the kernel.
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
check_target("llvm")
@tvm.testing.requires_gpu
def test_gpu():
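    """Build a thread-bound vector-add kernel from hand-written TIR and check it on GPU targets."""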
n = te.size_var("n")
dtype = "float32"
A = te.placeholder((n,), name="A")
B = te.placeholder((n,), name="B")
idxd = tvm.tir.indexdiv
def test_device_ir(A, B, C):
n = A.shape[0]
max_threads = 32
ib = tvm.tir.ir_builder.create()
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")
ib.scope_attr(bx, "thread_extent", idxd(n + max_threads - 1, max_threads))
ib.scope_attr(tx, "thread_extent", max_threads)
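        # flat global element index processed by this thread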
idx = bx.var * max_threads + tx.var
Aptr = ib.buffer_ptr(A)
Bptr = ib.buffer_ptr(B)
Cptr = ib.buffer_ptr(C)
with ib.if_scope(ib.likely(idx < n)):
Cptr[idx] = Aptr[idx] + Bptr[idx]
body = ib.get()
return body
C = te.extern(
A.shape,
[A, B],
lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
name="vector_add",
dtype=dtype,
)
s = te.create_schedule(C.op)
bounds = tvm.te.schedule.InferBound(s)
stmt = tvm.te.schedule.ScheduleOps(s, bounds)
def check_target(target):
n = 1024
if not tvm.testing.device_enabled(target):
return
# build and invoke the kernel.
fadd = tvm.build(s, [A, B, C], target)
dev = tvm.device(target, 0)
# launch the kernel.
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())
check_target("opencl")
check_target("cuda")
def test_while_vectorize():
"""Test while loop + vectorized inner loop"""
n = 64
num_iter = 10
def test_ir(A, B, C):
ib = tvm.tir.ir_builder.create()
n = C.shape[0]
A = ib.buffer_ptr(A)
B = ib.buffer_ptr(B)
C = ib.buffer_ptr(C)
i = ib.allocate("int32", (1,), name="i", scope="local")
i[0] = 0
with ib.for_range(0, n) as j:
C[j] = 0.0
with ib.while_loop(i[0] < num_iter):
with ib.for_range(0, n, kind="vectorize") as j:
C[j] += A[j] + B[j]
i[0] += 1
return ib.get()
def check_target(target, ir):
dtype = "float32"
A = te.placeholder((n,), name="A", dtype=dtype)
B = te.placeholder((n,), name="B", dtype=dtype)
C = te.extern(
(n,),
[A, B],
lambda ins, outs: ir(ins[0], ins[1], outs[0]),
name="while_vectorize",
dtype=dtype,
)
s = te.create_schedule(C.op)
with tvm.transform.PassContext(opt_level=3):
func = tvm.build(s, [A, B, C], target)
dev = tvm.device(target, 0)
a_np = np.random.uniform(size=n).astype(A.dtype)
b_np = np.random.uniform(size=n).astype(B.dtype)
a = tvm.nd.array(a_np, dev)
b = tvm.nd.array(b_np, dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
func(a, b, c)
ref = num_iter * (a_np + b_np)
tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
check_target("llvm", test_ir)
def test_while_collatz():
"""Test while loop + if"""
def collatz_ref(n):
a = n
i = 0
while a > 1:
if a % 2 == 1:
a = 3 * a + 1
else:
a = a >> 1
i += 1
return i
def collatz(ib, n, C):
i = ib.allocate("int32", (1,), name="i", scope="local")
a = ib.allocate("int32", (1,), name="a", scope="local")
i[0] = 0
a[0] = n
with ib.while_loop(a[0] > 1):
with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1):
a[0] = 3 * a[0] + 1
with ib.else_scope():
a[0] = a[0] >> 1
i[0] += 1
C[n] = i[0]
def collatz_ir_cpu(C):
ib = tvm.tir.ir_builder.create()
n = C.shape[0]
C = ib.buffer_ptr(C)
with ib.for_range(0, n, name="i", kind="parallel") as i:
collatz(ib, i, C)
body = ib.get()
return body
n = 30
def check_target(target, ir):
C = te.extern(
(n,),
[],
lambda ins, outs: ir(outs[0]),
name="collatz",
dtype="int32",
)
s = te.create_schedule(C.op)
with tvm.transform.PassContext(opt_level=3):
func = tvm.build(s, [C], target)
dev = tvm.device(target, 0)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
func(c)
ref = np.array([collatz_ref(i) for i in range(n)])
tvm.testing.assert_allclose(c.numpy(), ref)
check_target("llvm", collatz_ir_cpu)
def test_while_mandel():
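    """Test while loop inside nested/parallel loops by computing a Mandelbrot-style image."""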
n = 160
shape = (n * 2, n)
t = 300
def mandel_ref():
def complex_sqr(z):
return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2])
pixels = np.zeros(shape)
for i in range(pixels.shape[0]):
for j in range(pixels.shape[1]):
c = np.array([-0.8, np.cos(t) * 0.2])
z = np.array([i / n - 1, j / n - 0.5]) * 2
iterations = 0
while np.linalg.norm(z) < 20 and iterations < 50:
z = complex_sqr(z) + c
iterations += 1
pixels[i, j] = 1 - iterations * 0.02
return pixels
def mandel(ib, i, j, pixels):
z = ib.allocate("float32", (2,), name="z", scope="local")
tmp = ib.allocate("float32", (1,), name="tmp", scope="local")
iterations = ib.allocate("int32", (1,), name="iterations", scope="local")
z[0] = (i / float(n) - 1) * 2
z[1] = (j / float(n) - 0.5) * 2
iterations[0] = 0
c = [-0.8, float(np.cos(t)) * 0.2]
def norm(z):
return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1])
with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)):
tmp[0] = z[0]
z[0] = z[0] * z[0] - z[1] * z[1] + c[0]
z[1] = z[1] * tmp[0] * 2 + c[1]
iterations[0] += 1
pixels[i, j] = 1 - iterations[0] * 0.02
def mandel_ir_cpu(C):
ib = tvm.tir.ir_builder.create()
ny = C.shape[0]
nx = C.shape[1]
C = ib.buffer_ptr(C)
with ib.for_range(0, ny, name="i", kind="parallel") as i:
with ib.for_range(0, nx, name="j") as j:
mandel(ib, i, j, C)
body = ib.get()
return body
def mandel_ir_gpu(C):
ib = tvm.tir.ir_builder.create()
ny = C.shape[0]
nx = C.shape[1]
C = ib.buffer_ptr(C)
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")
by = te.thread_axis("blockIdx.y")
ty = te.thread_axis("threadIdx.y")
max_threads = 16
ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads))
ib.scope_attr(tx, "thread_extent", max_threads)
ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads))
ib.scope_attr(ty, "thread_extent", max_threads)
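        # 2D global pixel coordinates covered by this thread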
tidx = bx * max_threads + tx
tidy = by * max_threads + ty
with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)):
mandel(ib, tidy, tidx, C)
body = ib.get()
return body
ref = mandel_ref()
def check_target(target, ir):
if not tvm.testing.device_enabled(target):
return
C = te.extern(
shape,
[],
lambda ins, outs: ir(outs[0]),
name="mandel_ir",
dtype="float32",
)
s = te.create_schedule(C.op)
with tvm.transform.PassContext(opt_level=3):
func = tvm.build(s, [C], target)
dev = tvm.device(target, 0)
c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev)
func(c)
tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
check_target("llvm", mandel_ir_cpu)
check_target("npvtx", mandel_ir_gpu)
check_target("cuda", mandel_ir_gpu)
check_target("vulkan", mandel_ir_gpu)
def test_while_binary_search():
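    """Test while loop + if by implementing binary search (checked against np.searchsorted)."""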
def binary_search(ib, n, i, Aptr, Bptr, Cptr):
lo = ib.allocate("int32", (1,), name="lo", scope="local")
hi = ib.allocate("int32", (1,), name="hi", scope="local")
lo[0] = 0
hi[0] = n
v = Bptr[i]
with ib.while_loop(lo[0] < hi[0]):
            mid = lo[0] + ((hi[0] - lo[0]) >> 1)  # midpoint of the current [lo, hi) range
with ib.if_scope(Aptr[mid] < v):
lo[0] = mid + 1
with ib.else_scope():
hi[0] = mid
Cptr[i] = lo[0]
def searchsorted_ir_cpu(A, B, C, n):
ib = tvm.tir.ir_builder.create()
Aptr = ib.buffer_ptr(A)
Bptr = ib.buffer_ptr(B)
Cptr = ib.buffer_ptr(C)
with ib.for_range(0, n, name="i", kind="parallel") as i:
binary_search(ib, n, i, Aptr, Bptr, Cptr)
body = ib.get()
return body
def searchsorted_ir_gpu(A, B, C, n):
ib = tvm.tir.ir_builder.create()
Aptr = ib.buffer_ptr(A)
Bptr = ib.buffer_ptr(B)
Cptr = ib.buffer_ptr(C)
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")
max_threads = 32
ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
ib.scope_attr(tx, "thread_extent", max_threads)
tid = bx * max_threads + tx
with ib.if_scope(tid < n):
binary_search(ib, n, tid, Aptr, Bptr, Cptr)
body = ib.get()
return body
n = 1024
dtype = "float32"
A = te.placeholder((n,), name="A", dtype=dtype)
B = te.placeholder((n,), name="B", dtype=dtype)
def check_target(target, ir):
if not tvm.testing.device_enabled(target):
return
C = te.extern(
A.shape,
[A, B],
lambda ins, outs: ir(ins[0], ins[1], outs[0], n),
name="searchsorted_ir",
dtype="int32",
)
s = te.create_schedule(C.op)
with tvm.transform.PassContext(opt_level=3):
func = tvm.build(s, [A, B, C], target)
dev = tvm.device(target, 0)
a_np = np.random.uniform(size=n).astype(A.dtype)
b_np = np.random.uniform(size=n).astype(B.dtype)
a_np = np.sort(a_np)
a = tvm.nd.array(a_np, dev)
b = tvm.nd.array(b_np, dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
func(a, b, c)
ref = np.searchsorted(a_np, b_np)
tvm.testing.assert_allclose(c.numpy(), ref)
check_target("llvm", searchsorted_ir_cpu)
check_target("cuda", searchsorted_ir_gpu)
check_target("nvptx", searchsorted_ir_gpu)
check_target("vulkan", searchsorted_ir_gpu)
@tvm.testing.requires_gpu
def test_dyn_shared():
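    """Test dynamic shared memory ("shared.dyn") with a reduction whose size is only known at run time."""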
n = te.size_var("n")
dtype = "float32"
A = te.placeholder((n,), name="A")
def test_device_ir(A, B):
n = A.shape[0]
ib = tvm.tir.ir_builder.create()
tx = te.thread_axis("threadIdx.x")
ib.scope_attr(tx, "thread_extent", n)
temp = ib.allocate(dtype, (n,), scope="shared.dyn") # n is symbolic size
Aptr = ib.buffer_ptr(A)
Bptr = ib.buffer_ptr(B)
temp[tx] = Aptr[tx]
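        # tree reduction: ceil(log2(n)) steps, halving the number of active lanes each step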
depth = tvm.tir.log2(cast(n, "float32"))
with ib.for_range(0, cast(tvm.tir.ceil(depth), n.dtype)) as i:
ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])))
d = n >> (i + 1)
with ib.if_scope(tx < d):
temp[tx] += temp[tx + d]
Bptr[0] = temp[0]
return ib.get()
B = te.extern(
(1,),
[A],
lambda ins, outs: test_device_ir(ins[0], outs[0]),
name="reduce",
dtype=dtype,
)
s = te.create_schedule(B.op)
def check_target(target):
if not tvm.testing.device_enabled(target):
return
freduce = tvm.build(s, [A, B], target)
dev = tvm.device(target, 0)
for n in [512, 1024]:
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.zeros(1, dtype=B.dtype), dev)
freduce(a, b)
tvm.testing.assert_allclose(b.numpy()[0], np.sum(a.numpy()), 1e-4, 1e-4)
for target in ["cuda", "nvptx"]:
check_target(target)
if __name__ == "__main__":
test_prefetch()
test_if()
test_for()
test_cpu()
test_gpu()
test_while_vectorize()
test_while_collatz()
test_while_mandel()
test_while_binary_search()
test_dyn_shared()