# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Unit test VTA's instructions """
import tvm
from tvm import te
import numpy as np
import topi
from tvm.contrib import util
import vta
import vta.testing
from vta.testing import simulator
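# Fix the RNG seed so the randomized test data is reproducible across runs.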
np.random.seed(0xdeadb)


def test_save_load_out():
"""Test save/store output command"""
def _run(env, remote):
n = 6
x = te.placeholder(
(n, n, env.BATCH, env.BLOCK_OUT),
name="x",
dtype=env.acc_dtype)
x_buf = te.compute(
(n, n, env.BATCH, env.BLOCK_OUT),
lambda *i: x(*i), "x_buf")
# insert no-op that won't be optimized away
y_buf = te.compute(
(n, n, env.BATCH, env.BLOCK_OUT),
lambda *i: x_buf(*i)>>0, "y_buf")
y = te.compute(
(n, n, env.BATCH, env.BLOCK_OUT),
lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
# schedule
s = te.create_schedule(y.op)
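        # Stage x through the accumulator SRAM (acc_scope); the dma_copy
        # pragmas lower to VTA load/store instructions and the alu pragma
        # turns the no-op shift into a VTA ALU instruction.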
s[x_buf].set_scope(env.acc_scope)
s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
s[y_buf].set_scope(env.acc_scope)
s[y_buf].pragma(y_buf.op.axis[0], env.alu)
s[y].pragma(y.op.axis[0], env.dma_copy)
# verification
with vta.build_config():
m = vta.build(s, [x, y], "ext_dev", env.target_host)
if not remote:
return
temp = util.tempdir()
m.save(temp.relpath("load_act.o"))
remote.upload(temp.relpath("load_act.o"))
f = remote.load_module("load_act.o")
# verify
ctx = remote.ext_dev(0)
x_np = np.random.randint(
1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
y_np = x_np.astype(y.dtype)
x_nd = tvm.nd.array(x_np, ctx)
y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
f(x_nd, y_nd)
np.testing.assert_equal(y_np, y_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("Save load execution statistics:")
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
    vta.testing.run(_run)


def test_padded_load():
"""Test padded load."""
def _run(env, remote):
def check_padded_load(pad_before, pad_after, test_name=None):
# declare
n = 3
m = 5
x = te.placeholder(
(n, m, env.BATCH, env.BLOCK_OUT),
name="x",
dtype=env.acc_dtype)
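            # topi.nn.pad pads each of the four dimensions by pad_before /
            # pad_after; VTA can express this padding directly in its DMA
            # load instruction, which is what this test exercises.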
            x_buf = topi.nn.pad(x, pad_before, pad_after, name="x_buf")
# insert no-op that won't be optimized away
y_buf = te.compute((n + pad_before[0] + pad_after[0],
m + pad_before[1] + pad_after[1],
env.BATCH,
env.BLOCK_OUT), lambda *i: x_buf(*i)>>0, "y_buf")
y = te.compute((n + pad_before[0] + pad_after[0],
m + pad_before[1] + pad_after[1],
env.BATCH,
env.BLOCK_OUT), lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
# schedule
s = te.create_schedule(y.op)
s[x_buf].set_scope(env.acc_scope)
s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
s[y_buf].set_scope(env.acc_scope)
s[y_buf].pragma(y_buf.op.axis[0], env.alu)
s[y].pragma(y.op.axis[0], env.dma_copy)
# build
with vta.build_config():
mod = vta.build(s, [x, y], "ext_dev", env.target_host)
if not remote:
return
temp = util.tempdir()
mod.save(temp.relpath("padded_load.o"))
remote.upload(temp.relpath("padded_load.o"))
f = remote.load_module("padded_load.o")
# verify
ctx = remote.ext_dev(0)
x_np = np.random.randint(0, 10, size=(
n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
y_np = np.zeros((n + pad_before[0] + pad_after[0],
m + pad_before[1] + pad_after[1],
env.BATCH,
env.BLOCK_OUT)).astype(y.dtype)
y_np[pad_before[0]:pad_before[0] + n,
pad_before[1]:pad_before[1] + m,
:] = x_np
x_nd = tvm.nd.array(x_np, ctx)
y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
f(x_nd, y_nd)
np.testing.assert_equal(y_np, y_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("Padded {} load execution statistics:".format(test_name))
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
check_padded_load([2, 0, 0, 0], [0, 0, 0, 0], test_name="Y0")
check_padded_load([0, 2, 0, 0], [0, 0, 0, 0], test_name="Y1")
check_padded_load([0, 0, 0, 0], [2, 0, 0, 0], test_name="X0")
check_padded_load([0, 0, 0, 0], [0, 2, 0, 0], test_name="X1")
check_padded_load([1, 1, 0, 0], [1, 1, 0, 0], test_name="all")
    vta.testing.run(_run)


def test_gemm():
"""Test GEMM."""
def _run(env, remote):
# declare
o = 4
n = 1
m = 4
x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype)
w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype)
x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf")
w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf")
ko = te.reduce_axis((0, n), name="ko")
ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
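        # Blocked GEMM: x is (batch_outer, k_outer, BATCH, BLOCK_IN) and w is
        # (out_outer, k_outer, BLOCK_OUT, BLOCK_IN); ko and ki reduce over the
        # outer and inner parts of the contraction axis.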
y_gem = te.compute(
(o, m, env.BATCH, env.BLOCK_OUT),
lambda bo, co, bi, ci:
te.sum(x_buf[bo, ko, bi, ki].astype(env.acc_dtype) *
w_buf[co, ko, ci, ki].astype(env.acc_dtype),
axis=[ko, ki]),
name="y_gem")
y_shf = te.compute(
(o, m, env.BATCH, env.BLOCK_OUT),
lambda *i: y_gem(*i)>>8,
name="y_shf")
y_max = te.compute(
(o, m, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm.te.max(y_shf(*i), 0),
"y_max") #relu
y_min = te.compute(
(o, m, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm.te.min(y_max(*i), (1<<(env.INP_WIDTH-1))-1),
"y_min") #relu
y = te.compute(
(o, m, env.BATCH, env.BLOCK_OUT),
lambda *i: y_min(*i).astype(env.inp_dtype),
name="y")
if not remote:
return
def verify(s, name=None):
mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
temp = util.tempdir()
mod.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")
# verify
ctx = remote.ext_dev(0)
x_np = np.random.randint(
-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
w_np = np.random.randint(
-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
x_nd = tvm.nd.array(x_np, ctx)
w_nd = tvm.nd.array(w_np, ctx)
y_nd = tvm.nd.array(y_np, ctx)
y_np = y_np.astype(env.acc_dtype)
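            # Reference result: accumulate the blocked matmul in acc_dtype,
            # then apply the same >>8 shift and relu/saturation clipping as
            # the device pipeline above.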
for b in range(o):
for i in range(m):
for j in range(n):
y_np[b,i,:] += np.dot(x_np[b,j,:].astype(env.acc_dtype),
w_np[i,j].T.astype(env.acc_dtype))
y_np = np.right_shift(y_np, 8)
y_np = np.clip(y_np, 0, (1<<(env.INP_WIDTH-1))-1).astype(y.dtype)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
f(x_nd, w_nd, y_nd)
np.testing.assert_equal(y_np, y_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("GEMM schedule:{} execution statistics:".format(name))
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
def test_schedule1():
# default schedule with no smt
s = te.create_schedule(y.op)
# set the scope of the SRAM buffers
s[x_buf].set_scope(env.inp_scope)
s[w_buf].set_scope(env.wgt_scope)
s[y_gem].set_scope(env.acc_scope)
s[y_shf].set_scope(env.acc_scope)
s[y_max].set_scope(env.acc_scope)
s[y_min].set_scope(env.acc_scope)
# set pragmas for DMA transfer and ALU ops
s[x_buf].compute_at(s[y_gem], ko)
s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
s[w_buf].compute_at(s[y_gem], ko)
s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
s[y_max].pragma(s[y_max].op.axis[0], env.alu)
s[y_min].pragma(s[y_min].op.axis[0], env.alu)
s[y].pragma(s[y].op.axis[0], env.dma_copy)
# tensorization
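            # Reorder so that ko is outermost and ki innermost; the inner
            # (BATCH, BLOCK_OUT, ki) region then matches the shape expected
            # by the env.gemm intrinsic.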
s[y_gem].reorder(
ko,
s[y_gem].op.axis[0],
s[y_gem].op.axis[1],
s[y_gem].op.axis[2],
s[y_gem].op.axis[3],
ki)
s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
verify(s, name="default")
def test_smt():
# test smt schedule
s = te.create_schedule(y.op)
s[x_buf].set_scope(env.inp_scope)
s[w_buf].set_scope(env.wgt_scope)
s[y_gem].set_scope(env.acc_scope)
s[y_shf].set_scope(env.acc_scope)
s[y_max].set_scope(env.acc_scope)
s[y_min].set_scope(env.acc_scope)
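            # Split the output along its outermost axis and bind it to a
            # virtual thread ("cthread"); VTA interleaves the two threads to
            # hide memory-access latency behind compute.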
abo, aco, abi, aci = s[y].op.axis
abo1, abo2 = s[y].split(abo, nparts=2)
s[y].bind(abo1, te.thread_axis("cthread"))
s[y_gem].compute_at(s[y], abo1)
s[y_shf].compute_at(s[y], abo1)
s[y_max].compute_at(s[y], abo1)
s[y_min].compute_at(s[y], abo1)
s[y_gem].reorder(
ko,
s[y_gem].op.axis[0],
s[y_gem].op.axis[1],
s[y_gem].op.axis[2],
s[y_gem].op.axis[3],
ki)
s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
s[y_max].pragma(s[y_max].op.axis[0], env.alu)
s[y_min].pragma(s[y_min].op.axis[0], env.alu)
s[x_buf].compute_at(s[y_gem], ko)
s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
s[w_buf].compute_at(s[y_gem], ko)
s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
s[y].pragma(abo2, env.dma_copy)
verify(s, name="smt")
test_schedule1()
test_smt()
    vta.testing.run(_run)


def test_alu():
def _run(env, remote):
def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
"""Test ALU"""
m = 8
n = 8
imm = np.random.randint(1,5)
# compute
a = te.placeholder(
(m, n, env.BATCH, env.BLOCK_OUT),
name="a",
dtype=env.acc_dtype)
a_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: a(*i),
"a_buf") #DRAM->SRAM
if use_imm:
res_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm_op(a_buf(*i), imm),
"res_buf") #compute
else:
b = te.placeholder(
(m, n, env.BATCH, env.BLOCK_OUT),
name="b",
dtype=env.acc_dtype)
b_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: b(*i),
"b_buf") #DRAM->SRAM
res_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
"res_buf") #compute5B
res = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: res_buf(*i).astype(env.inp_dtype),
"res") #SRAM->DRAM
# schedule
s = te.create_schedule(res.op)
s[a_buf].set_scope(env.acc_scope) # SRAM
s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM
s[res_buf].set_scope(env.acc_scope) # SRAM
s[res_buf].pragma(res_buf.op.axis[0], env.alu) # compute
s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM
if not use_imm:
s[b_buf].set_scope(env.acc_scope) # SRAM
s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy) # DRAM->SRAM
if not remote:
return
# build
with vta.build_config():
if use_imm:
mod = vta.build(s, [a, res], "ext_dev", env.target_host)
else:
mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
temp = util.tempdir()
mod.save(temp.relpath("load_act.o"))
remote.upload(temp.relpath("load_act.o"))
f = remote.load_module("load_act.o")
# verify
ctx = remote.ext_dev(0)
a_np = np.random.randint(
-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
if use_imm:
res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
else:
b_np = np.random.randint(
-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
res_np = res_np.astype(res.dtype)
a_nd = tvm.nd.array(a_np, ctx)
res_nd = tvm.nd.array(
np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
if use_imm:
f(a_nd, res_nd)
else:
b_nd = tvm.nd.array(b_np, ctx)
f(a_nd, b_nd, res_nd)
np.testing.assert_equal(res_np, res_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("ALU {} execution statistics:".format(test_name))
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
check_alu(lambda x, y: x << y, np.left_shift, use_imm=True, test_name="SHL")
check_alu(tvm.te.max, np.maximum, use_imm=True, test_name="MAX")
check_alu(tvm.te.max, np.maximum, test_name="MAX")
check_alu(lambda x, y: x + y, use_imm=True, test_name="ADD")
check_alu(lambda x, y: x + y, test_name="ADD")
check_alu(lambda x, y: x >> y, np.right_shift, use_imm=True, test_name="SHR")
    vta.testing.run(_run)


def test_relu():
"""Test RELU on ALU"""
def _run(env, remote):
m = 8
n = 10
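        # ReLU is expressed as max(x, 0) followed by a min() that saturates
        # the result to the largest value representable in inp_dtype.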
# compute
a = te.placeholder(
(m, n, env.BATCH, env.BLOCK_OUT),
name="a",
dtype=env.acc_dtype)
a_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: a(*i),
"a_buf") # DRAM->SRAM
max_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm.te.max(a_buf(*i), 0),
"res_buf") # relu
min_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: tvm.te.min(max_buf(*i), (1<<(env.INP_WIDTH-1))-1),
"max_buf") # relu
res = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: min_buf(*i).astype(env.inp_dtype),
"min_buf") # SRAM->DRAM
# schedule
s = te.create_schedule(res.op)
s[a_buf].set_scope(env.acc_scope) # SRAM
s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM
s[max_buf].set_scope(env.acc_scope) # SRAM
s[min_buf].set_scope(env.acc_scope) # SRAM
s[max_buf].pragma(max_buf.op.axis[0], env.alu) # compute
s[min_buf].pragma(min_buf.op.axis[0], env.alu) # compute
s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM
# build
with vta.build_config():
mod = vta.build(s, [a, res], "ext_dev", env.target_host)
if not remote:
return
temp = util.tempdir()
mod.save(temp.relpath("load_act.o"))
remote.upload(temp.relpath("load_act.o"))
f = remote.load_module("load_act.o")
# verify
ctx = remote.ext_dev(0)
a_np = np.random.randint(
-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
res_np = np.clip(a_np, 0, (1<<(env.INP_WIDTH-1))-1).astype(res.dtype)
a_nd = tvm.nd.array(a_np, ctx)
res_nd = tvm.nd.array(
np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
f(a_nd, res_nd)
np.testing.assert_equal(res_np, res_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("Relu execution statistics:")
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
    vta.testing.run(_run)


def test_shift_and_scale():
"""Test shift and scale on ALU"""
def _run(env, remote):
m = 2
n = 8
imm_shift = np.random.randint(0,8)
imm_scale = np.random.randint(1,5)
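        # Despite the names, imm_shift is an additive offset and imm_scale is
        # a right-shift amount (i.e. scaling down by a power of two).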
# compute
a = te.placeholder(
(m, n, env.BATCH, env.BLOCK_OUT),
name="a", dtype=env.acc_dtype)
a_buf = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: a(*i),
"a_buf") # DRAM->SRAM
res_shift = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: a_buf(*i)+imm_shift,
"res_shift") # compute
res_scale = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: res_shift(*i)>>imm_scale,
"res_scale") # compute
res = te.compute(
(m, n, env.BATCH, env.BLOCK_OUT),
lambda *i: res_scale(*i).astype(env.inp_dtype),
"res") # SRAM->DRAM
# schedule
s = te.create_schedule(res.op)
s[a_buf].set_scope(env.acc_scope) # SRAM
s[res_shift].set_scope(env.acc_scope) # SRAM
s[res_scale].set_scope(env.acc_scope) # SRAM
s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy) # DRAM->SRAM
s[res_shift].pragma(res_shift.op.axis[0], env.alu) # compute
s[res_scale].pragma(res_scale.op.axis[0], env.alu) # compute
s[res].pragma(res.op.axis[0], env.dma_copy) # SRAM->DRAM
# build
mod = vta.build(s, [a, res], "ext_dev", env.target_host)
if not remote:
return
temp = util.tempdir()
mod.save(temp.relpath("load_act.o"))
remote.upload(temp.relpath("load_act.o"))
f = remote.load_module("load_act.o")
# verify
ctx = remote.ext_dev(0)
a_np = np.random.randint(
-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
res_np = np.right_shift((a_np + imm_shift), imm_scale)
res_np = res_np.astype(res.dtype)
a_nd = tvm.nd.array(a_np, ctx)
res_nd = tvm.nd.array(
np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)
if env.TARGET in ["sim", "tsim"]:
simulator.clear_stats()
f(a_nd, res_nd)
np.testing.assert_equal(res_np, res_nd.asnumpy())
if env.TARGET in ["sim", "tsim"]:
sim_stats = simulator.stats()
print("Shift and scale execution statistics:")
for k, v in sim_stats.items():
print("\t{:<16}: {:>16}".format(k, v))
    vta.testing.run(_run)


def test_runtime_array():
def _run(env, remote):
n = 100
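        # Round-trip an array through device memory to check the runtime's
        # upload/download path.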
ctx = remote.ext_dev(0)
x_np = np.random.randint(
1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype("int8")
x_nd = tvm.nd.array(x_np, ctx)
np.testing.assert_equal(x_np, x_nd.asnumpy())
    vta.testing.run(_run)


if __name__ == "__main__":
test_runtime_array()
test_save_load_out()
test_padded_load()
test_gemm()
test_alu()
test_relu()
test_shift_and_scale()