Chisel Pipelined GEMM (#30)

* Reset to 644 file permissions

* Add json files to src/test/resources for testing

* Add new TensorGemmPipelinedSplit module and rename existing TensorGemm to TensorGemmOrig

* Tests for TensorGemmPipelinedSplit, TensorGemmOrig, and associated submodules

* Add jackson plugin dependency and stricter Scala checks

* Remove debug prints

* Rename x.json and y.json to gemm_1uop_overflow_offset.json and gemm_2uop_overflow_cascaded.json respectively

* All occurrences of '\( ' replaced with '\('

* Add linting rule to flag spaces after lparen characters

* Remove comment

* Rename TensorGemmOrig to TensorGemmSimple
diff --git a/hardware/chisel/build.sbt b/hardware/chisel/build.sbt
index 7efd59d..851f5ab 100644
--- a/hardware/chisel/build.sbt
+++ b/hardware/chisel/build.sbt
@@ -68,5 +68,13 @@
 libraryDependencies ++= Seq("chisel3","chisel-iotesters").map {
   dep: String => "edu.berkeley.cs" %% dep % sys.props.getOrElse(dep + "Version", defaultVersions(dep)) }
 
+libraryDependencies ++= Seq(
+  "com.fasterxml.jackson.core" % "jackson-databind" % "2.10.3",
+  "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.10.3"
+)
+
+scalacOptions += "-language:reflectiveCalls"
+scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings")
+
 scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
 javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/hardware/chisel/scalastyle-config.xml b/hardware/chisel/scalastyle-config.xml
index 1252900..89196be 100644
--- a/hardware/chisel/scalastyle-config.xml
+++ b/hardware/chisel/scalastyle-config.xml
@@ -71,6 +71,11 @@
  </check>
  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.DisallowSpaceAfterTokenChecker" enabled="true">
+  <parameters>
+   <parameter name="tokens">LPAREN</parameter>
+  </parameters>
+ </check>
  <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check>
  <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
diff --git a/hardware/chisel/src/main/scala/core/TensorGemm.scala b/hardware/chisel/src/main/scala/core/TensorGemm.scala
index e977552..f63de94 100644
--- a/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ b/hardware/chisel/src/main/scala/core/TensorGemm.scala
@@ -21,12 +21,11 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.experimental._
 import vta.util.config._
 import scala.math.pow
 
 /** Pipelined multiply and accumulate */
-class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module {
+class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16, flopIn: Boolean = false) extends Module {
   val outBits = Math.max(aBits + bBits, cBits) + 1
   val io = IO(new Bundle {
     val a = Input(SInt(aBits.W))
@@ -34,16 +33,15 @@
     val c = Input(SInt(cBits.W))
     val y = Output(SInt(outBits.W))
   })
+
   val mult = Wire(SInt((aBits + bBits).W))
-  val add = Wire(SInt(outBits.W))
-  val rA = RegNext(io.a)
-  val rB = RegNext(io.b)
-  val rC = RegNext(io.c)
+  val rA = if (flopIn) RegNext(io.a) else io.a
+  val rB = if (flopIn) RegNext(io.b) else io.b
+  val rC = if (flopIn) RegNext(io.c) else io.c
 
   mult := rA * rB
-  add := rC +& mult
-
-  io.y := add
+  val addV = if (flopIn) {rC +& mult} else {RegNext(rC +& mult)}
+  io.y := addV
 }
 
 /** PipeAdder
@@ -86,28 +84,31 @@
 }
 
 /** Pipelined DotProduct based on MAC and PipeAdder */
-class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module {
+class DotProduct(aBits: Int = 8, bBits: Int = 8, blockIn: Int = 16) extends Module {
   val errorMsg =
     s"\n\n[VTA] [DotProduct] size must be greater than 4 and a power of 2\n\n"
-  require(size >= 2 && isPow2(size), errorMsg)
+  require(blockIn >= 2 && isPow2(blockIn), errorMsg)
   val b = aBits + bBits
-  val outBits = b + log2Ceil(size) + 1
+  val outBits = b + log2Ceil(blockIn) + 1
   val io = IO(new Bundle {
-    val a = Input(Vec(size, SInt(aBits.W)))
-    val b = Input(Vec(size, SInt(bBits.W)))
+    val a = Input(Vec(blockIn, SInt(aBits.W)))
+    val b = Input(Vec(blockIn, SInt(bBits.W)))
     val y = Output(SInt(outBits.W))
   })
-  val s = Seq.tabulate(log2Ceil(size + 1))(i =>
-    pow(2, log2Ceil(size) - i).toInt) // # of total layers
-  val p = log2Ceil(size / 2) + 1 // # of adder layers
-  val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs
+  val s = Seq.tabulate(log2Ceil(blockIn + 1))(i =>
+    pow(2, log2Ceil(blockIn) - i).toInt) // # of total layers
+  val p = log2Ceil(blockIn / 2) + 1 // # of adder layers
+  val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1, flopIn = p < 6))) // # of total vector pairs
   val a = Seq.tabulate(p)(
     i =>
       Seq.fill(s(i + 1))(
-        if (i == 0)
+        if ((i == 0 && p < 4) || (i == p - 2 && p >= 4)) {
           Module(new PipeAdder(aBits = (b + i + 1), bBits = (b + i + 1)))
-        else
-          Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer
+        }
+        else {
+          Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1)))
+        }
+      )) // # adders within each layer
 
   // Vector MACs
   for (i <- 0 until s(0)) {
@@ -137,7 +138,8 @@
 /** Perform matrix-vector-multiplication based on DotProduct */
 class MatrixVectorMultiplication(implicit p: Parameters) extends Module {
   val accBits = p(CoreKey).accBits
-  val size = p(CoreKey).blockOut
+  val size = p(CoreKey).blockOut / p(CoreKey).blockOutFactor
+  val batch = p(CoreKey).batch
   val inpBits = p(CoreKey).inpBits
   val wgtBits = p(CoreKey).wgtBits
   val outBits = p(CoreKey).outBits
@@ -149,28 +151,149 @@
     val acc_o = new TensorClientData(tensorType = "acc")
     val out = new TensorClientData(tensorType = "out")
   })
-  val dot = Seq.fill(size)(
-    Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size)))
+  val dot = Seq.fill(batch)(Seq.fill(size)(
+    Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size))))
   // Latency is defined as two in the following, because there is one cycle in the MAC module,
   // and another cycle in the pipelined adders as the first layer of the accumulator
-  val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2)))
-  val add = Seq.fill(size)(Wire(SInt(accBits.W)))
-  val vld = Wire(Vec(size, Bool()))
+  val acc = Seq.fill(batch)(Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2))))
+  val add = Seq.fill(batch)(Seq.fill(size)(Wire(SInt(accBits.W))))
+  val vld = Wire(Vec(batch, Vec(size, Bool())))
 
-  for (i <- 0 until size) {
-    acc(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
-    acc(i).io.enq.bits := io.acc_i.data.bits(0)(i)
-    for (j <- 0 until size) {
-      dot(i).io.a(j) := io.inp.data.bits(0)(j).asSInt
-      dot(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
+  for (b <- 0 until batch) {
+    for (i <- 0 until size) {
+      acc(b)(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
+      acc(b)(i).io.enq.bits := io.acc_i.data.bits(b)(i)
+      for (j <- 0 until size) {
+        dot(b)(i).io.a(j) := io.inp.data.bits(b)(j).asSInt
+        dot(b)(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt // all batches get the same weight - reuse
+      }
+      add(b)(i) := acc(b)(i).io.deq.bits.asSInt + dot(b)(i).io.y
+      io.acc_o.data.bits(b)(i) := Mux(io.reset, 0.U, add(b)(i).asUInt)
+      io.out.data.bits(b)(i) := add(b)(i).asUInt
+      vld(b)(i) := acc(b)(i).io.deq.valid
     }
-    add(i) := acc(i).io.deq.bits.asSInt + dot(i).io.y
-    io.acc_o.data.bits(0)(i) := Mux(io.reset, 0.U, add(i).asUInt)
-    io.out.data.bits(0)(i) := add(i).asUInt
-    vld(i) := acc(i).io.deq.valid
+    io.acc_o.data.valid := vld.asUInt.andR | io.reset
+    io.out.data.valid := vld.asUInt.andR
   }
-  io.acc_o.data.valid := vld.asUInt.andR | io.reset
-  io.out.data.valid := vld.asUInt.andR
+}
+
+/** Perform matrix-vector-multiplication based on DotProduct */
+class MatrixVectorMultiplicationBypass(implicit p: Parameters) extends Module {
+  val accBits = p(CoreKey).accBits
+  val blockOut = p(CoreKey).blockOut / p(CoreKey).blockOutFactor
+  val blockIn = p(CoreKey).blockIn
+  val batch   = p(CoreKey).batch
+  val inpBits = p(CoreKey).inpBits
+  val wgtBits = p(CoreKey).wgtBits
+  val outBits = p(CoreKey).outBits
+  val io = IO(new Bundle {
+    val valid_reset = Input(Bool())
+    val inp = new TensorMasterData(tensorType = "inp")
+    val wgt = new TensorMasterData(tensorType = "wgt")
+    val acc_i = new TensorMasterData(tensorType = "acc")
+    val acc_o = new TensorClientData(tensorType = "acc")
+    val out = new TensorClientData(tensorType = "out")
+    val bypass_cond = Input(Bool())
+  })
+  val dot = Seq.fill(batch)(Seq.fill(blockOut)(
+    Module(new DotProduct(aBits = inpBits, bBits = wgtBits, blockIn))))
+  val add = Seq.fill(batch)(Seq.fill(blockOut)(Wire(SInt(accBits.W))))
+  val last_acc_write = Seq.fill(batch)(Seq.fill(blockOut){Reg(SInt(accBits.W))})
+  io.out.data.bits := DontCare // out is not fully initialized by a single module
+  for (b <- 0 until batch) {
+    for (i <- 0 until blockOut) {
+      for (j <- 0 until blockIn) {
+        dot(b)(i).io.a(j) := io.inp.data.bits(b)(j).asSInt
+        dot(b)(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
+      }
+      val byp = Mux(io.bypass_cond, last_acc_write(b)(i), io.acc_i.data.bits(b)(i).asSInt)
+      add(b)(i) := byp + dot(b)(i).io.y
+      val tmp = Mux(io.valid_reset, 0.S, add(b)(i))
+      io.acc_o.data.bits(b)(i) := tmp.asUInt
+      last_acc_write(b)(i) := tmp
+      io.out.data.bits(b)(i) := add(b)(i).asUInt
+    }
+  }
+  io.acc_o.data.valid := io.acc_i.data.valid | io.valid_reset
+  io.out.data.valid := io.acc_i.data.valid & ~io.valid_reset
+}
+
+class TensorGemmIndexGenerator(implicit p: Parameters) extends Module {
+  val io = IO(new Bundle {
+    val start = Input(Bool())
+    val last = Output(Bool())
+
+    val dec = Input(new GemmDecode)
+
+    val acc_i = Output(UInt(new TensorParams(tensorType="acc").memAddrBits.W))
+    val inp_i = Output(UInt(new TensorParams(tensorType="inp").memAddrBits.W))
+    val wgt_i = Output(UInt(new TensorParams(tensorType="wgt").memAddrBits.W))
+
+    val uop_idx = Output(UInt(log2Ceil(p(CoreKey).uopMemDepth).W))
+
+    val valid = Output(Bool())
+  })
+
+  io.last := false.B
+
+  val running = RegInit(false.B)
+  when(!running && io.start) {
+    running := true.B
+  }.elsewhen(io.last) {
+    running := false.B
+  }
+
+  val cnt_i = Reg(chiselTypeOf(io.dec.lp_1))
+  val acc_i = Reg(chiselTypeOf(io.acc_i))
+  val inp_i = Reg(chiselTypeOf(io.inp_i))
+  val wgt_i = Reg(chiselTypeOf(io.wgt_i))
+
+  val cnt_o = Reg(chiselTypeOf(io.dec.lp_0))
+  val acc_o = Reg(chiselTypeOf(io.acc_i))
+  val inp_o = Reg(chiselTypeOf(io.inp_i))
+  val wgt_o = Reg(chiselTypeOf(io.wgt_i))
+
+  val uop_idx = Reg(chiselTypeOf(io.dec.uop_end))
+
+  io.valid := running
+  io.acc_i := acc_i
+  io.inp_i := inp_i
+  io.wgt_i := wgt_i
+  io.uop_idx := uop_idx
+
+  when(!running) {
+    cnt_i := 0.U; acc_i := 0.U; inp_i := 0.U; wgt_i := 0.U
+    cnt_o := 0.U; acc_o := 0.U; inp_o := 0.U; wgt_o := 0.U
+    uop_idx := io.dec.uop_begin
+  } .otherwise {
+    when (uop_idx =/= io.dec.uop_end - 1.U) {
+      uop_idx := uop_idx + 1.U
+    }.otherwise {
+      uop_idx := io.dec.uop_begin
+      when (cnt_i =/= io.dec.lp_1 - 1.U) {
+        cnt_i := cnt_i + 1.U
+        acc_i := acc_i + io.dec.acc_1
+        inp_i := inp_i + io.dec.inp_1
+        wgt_i := wgt_i + io.dec.wgt_1
+      }.otherwise {
+        when (cnt_o =/= io.dec.lp_0 - 1.U) {
+          val acc_tmp = acc_o + io.dec.acc_0
+          val inp_tmp = inp_o + io.dec.inp_0
+          val wgt_tmp = wgt_o + io.dec.wgt_0
+          cnt_o := cnt_o + 1.U
+          acc_o := acc_tmp
+          inp_o := inp_tmp
+          wgt_o := wgt_tmp
+          cnt_i := 0.U
+          acc_i := acc_tmp
+          inp_i := inp_tmp
+          wgt_i := wgt_tmp
+        } .otherwise {
+          io.last := true.B
+        }
+      }
+    }
+  }
 }
 
 abstract class TensorGemmIfc(implicit p: Parameters) extends Module {
@@ -190,16 +313,16 @@
   })
 }
 
-/** TensorGemm.
+/** TensorGemmSimple
  *
  * This unit instantiate the MatrixVectorMultiplication and go over the
  * micro-ops (uops) which are used to read inputs, weights and biases,
  * and writes results back to the acc and out scratchpads.
  *
- * Also, the TensorGemm uses the reset field in the Gemm instruction to
+ * Also, TensorGemmSimple uses the reset field in the Gemm instruction to
  * clear or zero-out the acc-scratchpad locations based on the micro-ops.
  */
-class TensorGemm(debug: Boolean = false)(implicit p: Parameters) extends TensorGemmIfc {
+class TensorGemmSimple(debug: Boolean = false)(implicit p: Parameters) extends TensorGemmIfc {
 
   require(p(CoreKey).blockOutFactor == 1,
     "-F- Split GEMM not supported. Use TensorGemmPipelinedSplit or set blockOutFactor to 1")
@@ -227,12 +350,12 @@
   // Latency is defined as two in the following, because there is one cycle in the MAC module,
   // and another cycle in the pipelined adders as the first layer of the accumulator
   val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2))
-  val cond_last = cnt_o === dec.lp_0 - 1.U &
+  val cond = cnt_o === dec.lp_0 - 1.U &
     cnt_i === dec.lp_1 - 1.U &
     uop_idx === uop_end - 1.U
 
   val done = inflight === 0.U &
-    ((state === sExe) & cond_last | state === sWait)
+    ((state === sExe) & cond | state === sWait)
 
   switch(state) {
     is(sIdle) {
@@ -250,7 +373,7 @@
       state := sExe
     }
     is(sExe) {
-      when(cond_last) {
+      when(cond) {
         when(inflight =/= 0.U) {
           state := sWait
         }.otherwise {
@@ -421,3 +544,206 @@
     }
   }
 }
+
+class TensorGemmPipelinedSplit (implicit p: Parameters) extends TensorGemmIfc {
+  val sIdle::sRun::sWait::Nil = Enum(3);
+  val numMVMs = p(CoreKey).blockOutFactor
+  val numOuts = p(CoreKey).blockOut / numMVMs
+  require (numOuts > 0, "-F- Cannot factor more groups than blockOut")
+  val batch = p(CoreKey).batch
+
+  val m = Module(new TensorGemmIndexGenerator)
+
+  // additional pipe latency of wgt/inp read if needed
+  val scratchpadReadLatency = 0
+  val inpReadIdxLatency = 0
+  val uopReadLatency = 0
+
+  val delayed_valid = ShiftRegister(m.io.valid, uopReadLatency + 1, resetData = false.B, en = true.B)
+  val delayed_acc_i = ShiftRegister(m.io.acc_i, uopReadLatency + 1)
+  val delayed_inp_i = ShiftRegister(m.io.inp_i, uopReadLatency + 1)
+  val delayed_wgt_i = ShiftRegister(m.io.wgt_i, uopReadLatency + 1)
+
+  val state = RegInit(sIdle)
+  val inflight = RegInit(0.U(inflightBits.W))
+
+  val capture_dec = Reg(chiselTypeOf(io.dec))
+
+  io.done := false.B
+  when(state === sIdle && io.start) {
+    state := sRun
+    capture_dec := io.dec
+    // if (io.dec.empty_0 != None) assert(io.dec.empty_0.get === 0.U)
+    // if (io.dec.empty_1 != None) assert(io.dec.empty_1.get === 0.U)
+  }.elsewhen(state === sRun && m.io.last) {
+    state := sWait
+  }.elsewhen(state === sWait && inflight === 0.U) {
+    state := sIdle
+    io.done := true.B
+  }
+  io.state := state
+
+  assert(state =/= sRun  || capture_dec.asUInt === io.dec.asUInt)
+  assert(state =/= sWait || capture_dec.asUInt === io.dec.asUInt)
+
+  m.io.start := io.start
+
+  m.io.dec := io.dec
+  io.uop.idx.bits := m.io.uop_idx
+  io.uop.idx.valid := m.io.valid
+
+  val delayedUopData = ShiftRegister(io.uop.data, uopReadLatency)
+
+  assert(delayedUopData.valid === delayed_valid)
+
+  val uop_valid = ShiftRegister(delayed_valid, inpReadIdxLatency, resetData = false.B, en = true.B)
+  val uop_acc = ShiftRegister(delayedUopData.bits.u0 + delayed_acc_i, inpReadIdxLatency)
+  val uop_inp =  delayedUopData.bits.u1 + delayed_inp_i // it is piped in inp tensor read
+  val uop_wgt = ShiftRegister(delayedUopData.bits.u2 + delayed_wgt_i, inpReadIdxLatency)
+
+  val reset_pipe = Module(
+    new Pipe(
+      Bool(),
+      latency = 3 /* 1 stage is borrowed down here*/ + scratchpadReadLatency + inpReadIdxLatency + uopReadLatency))
+  reset_pipe.io.enq.valid := m.io.valid
+  reset_pipe.io.enq.bits := capture_dec.reset
+
+  val acc_idx_pipe = Module(
+    new Pipe(chiselTypeOf(io.acc.rd(0).idx.bits), latency= 1 /* borrow 1 stage to split*/ + scratchpadReadLatency))
+  acc_idx_pipe.io.enq.valid := uop_valid
+  acc_idx_pipe.io.enq.bits := uop_acc
+
+  require(io.inp.splitWidth == 1 && io.inp.splitLength == 1, "-F- Input split read not supported")
+  io.inp.rd(0).idx.valid := delayed_valid
+  io.inp.rd(0).idx.bits := uop_inp
+  val delayed_uop_valid = RegNext(uop_valid, init=false.B) // memdelay
+  // assert fires on emulated tensorRead Direct GEMM test TODO: fix memoryManager sram read
+  // it works only for VTA_CORE_GEMM_INP_IDX_PIPE 0
+  assert(io.inp.rd(0).data.valid === delayed_uop_valid)
+  for (idx <- 0 until numMVMs) {
+    io.acc.rd(idx).idx.valid := RegNext(acc_idx_pipe.io.deq.valid, init = false.B)
+    io.acc.rd(idx).idx.bits := RegNext(acc_idx_pipe.io.deq.bits)
+
+    // delay wgt read by input result delay latency
+    io.wgt.rd(idx).idx.valid := ShiftRegister(uop_valid, scratchpadReadLatency)
+    io.wgt.rd(idx).idx.bits := ShiftRegister(uop_wgt, scratchpadReadLatency)
+
+    assert(io.wgt.rd(idx).data.valid === ShiftRegister(delayed_uop_valid, scratchpadReadLatency))
+  }
+  io.wgt.tieoffWrite()
+  io.inp.tieoffWrite()
+
+  // create a pipe of 3+ delay with split by group last stage
+  // and a separate last stage for out and inflight
+  val wrpipe0 = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 2 + scratchpadReadLatency))
+  wrpipe0.io.enq.valid := uop_valid
+  wrpipe0.io.enq.bits := uop_acc
+  // write pipe not split
+  val wrpipeNs = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 1))
+  wrpipeNs.io.enq <> wrpipe0.io.deq
+  // split the last pipe stage per group
+  val wrpipe =  for (idx <- 0 until numMVMs) yield {
+    val pipe = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 1))
+    pipe.io.enq <> wrpipe0.io.deq
+    pipe
+  }
+
+  for (idx <- 0 until numMVMs) {
+    assert(io.acc.rd(idx).data.valid === wrpipe(idx).io.deq.valid)
+  }
+
+  when(m.io.valid && wrpipeNs.io.deq.valid) {
+  }.elsewhen(m.io.valid) {
+    assert(inflight =/= ((1<<inflightBits)-1).U)
+    inflight := inflight + 1.U
+  }.elsewhen(wrpipeNs.io.deq.valid) {
+    assert(inflight =/= 0.U)
+    inflight := inflight - 1.U
+  }
+  when(state === sIdle) {
+    assert(inflight === 0.U)
+    inflight := 0.U
+  }
+
+  io.inflight := inflight
+
+  val mvmInpRdLatency = if (scratchpadReadLatency == 0) {
+    0
+  } else {
+    scratchpadReadLatency - 1
+  }
+  // split factor of inp data for many groups
+  val splitFactorL0 = pow(2,log2Ceil(numMVMs) / 2).toInt
+  val splitFactorL1 = pow(2,log2Ceil(numMVMs)
+    - log2Ceil(numMVMs) / 2).toInt
+  require(splitFactorL0 * splitFactorL1 == numMVMs)
+  val inpRdData0 = for (idx <- 0 until splitFactorL0) yield {
+    if (scratchpadReadLatency > 0) RegNext(io.inp.rd(0).data) else io.inp.rd(0).data
+  }
+
+  // define MVC groups operating on a subset of acc elements
+  // each MVM generates only a part of acc bits while it has the whole interface defined !!!
+  // those bits are lower bits in acc/out interface
+  val mvc = for (idx <- 0 until numMVMs) yield {Module(new MatrixVectorMultiplicationBypass)}
+
+  require(io.out.splitWidth == 1 && io.out.splitLength == 1, "-F- Out split write is not supported")
+  for (idx1 <- 0 until numMVMs) {
+
+    val wrpipe2 = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency=1))
+    wrpipe2.io.enq := wrpipe(idx1).io.deq
+
+    mvc(idx1).io.bypass_cond :=
+      wrpipe(idx1).io.deq.bits === wrpipe2.io.deq.bits && wrpipe(idx1).io.deq.valid && wrpipe2.io.deq.valid
+
+    // borrow one stage from reset_pipe and split per group
+    mvc(idx1).io.valid_reset := RegNext(reset_pipe.io.deq.bits & reset_pipe.io.deq.valid, init = false.B)
+    // wire to each mvm
+    mvc(idx1).io.inp.data :=
+      ShiftRegister(inpRdData0(idx1/splitFactorL1), mvmInpRdLatency) // delay to deliver over distance
+    mvc(idx1).io.wgt.data := io.wgt.rd(idx1).data // wgt read idx is delayed instead of data
+    mvc(idx1).io.acc_i.data.valid := io.acc.rd(idx1).data.valid
+    assert(mvc(idx1).io.acc_o.data.valid === (wrpipe(idx1).io.deq.valid | mvc(idx1).io.valid_reset))
+    for(accLenIdx <- 0 until mvc(idx1).io.acc_o.lenSplit) {
+      for(accWdtIdx <- 0 until mvc(idx1).io.acc_o.widthSplit) {
+        val (gemmGrpIdx, gemmLenIdx, gemmWdtIdx) =
+          mvc(idx1).io.acc_o.reindexDataToGroup(idx1, accLenIdx, accWdtIdx)
+        mvc(gemmGrpIdx).io.acc_i.data.bits(gemmLenIdx)(gemmWdtIdx) :=
+          io.acc.rd(idx1).data.bits(accLenIdx)(accWdtIdx)
+      }
+    }
+
+    for(gemmLenIdx <- 0 until mvc(idx1).io.acc_o.lenSplit) {
+      for(gemmWdtIdx <- 0 until mvc(idx1).io.acc_o.widthSplit) {
+        val (accGrpIdx, accLenIdx, accWdtIdx) =
+          mvc(idx1).io.acc_o.reindexDataFromGroup(idx1, gemmLenIdx, gemmWdtIdx)
+        io.acc.wr(accGrpIdx).bits.data(accLenIdx)(accWdtIdx) :=
+          mvc(idx1).io.acc_o.data.bits(gemmLenIdx)(gemmWdtIdx)
+      }
+    }
+
+    io.acc.wr(idx1).valid := wrpipe(idx1).io.deq.valid
+    io.acc.wr(idx1).bits.idx := wrpipe(idx1).io.deq.bits
+  }
+// comment to split write out
+  if (numMVMs > 1) {
+    for (idx1 <- 1 until numMVMs) {
+      assert(mvc(idx1).io.out.data.valid === mvc(idx1 - 1).io.out.data.valid,
+        "-F- Out split write is not supported")
+    }
+  }
+  val outData = Wire(io.out.wr(0).bits.data.cloneType)
+  for (idx3 <- 0 until numMVMs) {
+    for (idx1 <- 0 until io.out.tensorLength) {
+      for (idx2 <- 0 until io.out.tensorWidth/numMVMs) {
+        outData(idx1)(idx3*io.out.tensorWidth/numMVMs + idx2) := mvc(idx3).io.out.data.bits(idx1)(idx2)
+      }
+    }
+  }
+  io.out.wr(0).bits.data := outData
+  io.out.wr(0).valid := wrpipeNs.io.deq.valid && mvc(io.acc.closestIOGrpIdx).io.out.data.valid
+  io.out.wr(0).bits.idx := wrpipeNs.io.deq.bits
+
+  io.out.tieoffRead()
+}
+
+class TensorGemm(implicit val p: Parameters) extends TensorGemmPipelinedSplit
diff --git a/hardware/chisel/src/test/resources/.gitignore b/hardware/chisel/src/test/resources/.gitignore
new file mode 100644
index 0000000..0521c5f
--- /dev/null
+++ b/hardware/chisel/src/test/resources/.gitignore
@@ -0,0 +1 @@
+!*.json
diff --git a/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json b/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json
new file mode 100644
index 0000000..05bc0b1
--- /dev/null
+++ b/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json
@@ -0,0 +1,188 @@
+{
+  "inst": {
+    "reset": "0",
+    "uop_begin": "0001",
+    "uop_end": "0002",
+    "lp_0": "0001",
+    "lp_1": "0001",
+    "acc_0": "000",
+    "acc_1": "000",
+    "inp_0": "000",
+    "inp_1": "000",
+    "wgt_0": "000",
+    "wgt_1": "000"
+  },
+  "inp": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "01",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00",
+        "01",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00"
+      ]
+    }
+  ],
+  "wgt": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
+        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
+        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
+        "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
+        "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
+        "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
+        "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
+        "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
+        "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
+        "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
+        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
+        "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
+        "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "da", "db", "dc", "dd", "de", "df",
+        "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7", "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
+        "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
+      ]
+    }
+  ],
+  "acc_i": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000001",
+        "00000002",
+        "00000003",
+        "00000004",
+        "00000005",
+        "00000006",
+        "00000007",
+        "00000008",
+        "00000009",
+        "0000000a",
+        "0000000b",
+        "0000000c",
+        "0000000d",
+        "0000000e",
+        "0000000f"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000000",
+        "00000001",
+        "00000002",
+        "00000003",
+        "00000004",
+        "00000005",
+        "00000006",
+        "00000007",
+        "00000008",
+        "00000009",
+        "0000000a",
+        "0000000b",
+        "0000000c",
+        "0000000d",
+        "0000000e",
+        "0000000f"
+      ]
+    } 
+  ],
+  "acc_o": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000001",
+        "00000002",
+        "00000003",
+        "00000004",
+        "00000005",
+        "00000006",
+        "00000007",
+        "00000008",
+        "00000009",
+        "0000000a",
+        "0000000b",
+        "0000000c",
+        "0000000d",
+        "0000000e",
+        "0000000f"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000001",
+        "00000012",
+        "00000023",
+        "00000034",
+        "00000045",
+        "00000056",
+        "00000067",
+        "00000078",
+        "ffffff89",
+        "ffffff9a",
+        "ffffffab",
+        "ffffffbc",
+        "ffffffcd",
+        "ffffffde",
+        "ffffffef",
+        "00000000"
+      ]
+    } 
+  ],
+  "uop": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000001",
+        "00000001",
+        "00000000"
+      ]
+    }
+  ]
+}
diff --git a/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json b/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json
new file mode 100644
index 0000000..9ca3e4f
--- /dev/null
+++ b/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json
@@ -0,0 +1,188 @@
+{
+  "inst": {
+    "reset": "0",
+    "uop_begin": "0000",
+    "uop_end": "0002",
+    "lp_0": "0001",
+    "lp_1": "0001",
+    "acc_0": "000",
+    "acc_1": "000",
+    "inp_0": "000",
+    "inp_1": "000",
+    "wgt_0": "000",
+    "wgt_1": "000"
+  },
+  "inp": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "01",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "ff",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00",
+        "00"
+      ]
+    }
+  ],
+  "wgt": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
+        "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
+        "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
+        "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
+        "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
+        "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
+        "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
+        "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
+        "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
+        "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
+        "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
+        "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
+        "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "da", "db", "dc", "dd", "de", "df",
+        "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7", "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
+        "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
+      ]
+    }
+  ],
+  "acc_i": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    } 
+  ],
+  "acc_o": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    } 
+  ],
+  "uop": [
+    {
+      "idx": "00000000",
+      "vec": [
+        "00000000",
+        "00000000",
+        "00000000"
+      ]
+    },
+    {
+      "idx": "00000001",
+      "vec": [
+        "00000000",
+        "00000001",
+        "00000000"
+      ]
+    }
+  ]
+}
diff --git a/hardware/chisel/src/test/scala/unittest/GemmTest.scala b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
new file mode 100644
index 0000000..f548389
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import vta.core._
+import vta.util.config._
+
class MACTester(c: MAC) extends PeekPokeTester(c) {
  // Each tuple is (a, b, c, expected y); the result is sampled one step
  // after the operands are driven, so y = a*b + c is checked post-step.
  val vectors = Seq(
    (-1, 7, 10, 3),
    (-2, 7, 11, -3))
  for ((inA, inB, inC, expY) <- vectors) {
    poke(c.io.a, inA)
    poke(c.io.b, inB)
    poke(c.io.c, inC)
    step(1)
    expect(c.io.y, expY)
  }
}
+
// Elaborates a MAC unit and drives it with MACTester.
class MACTest extends GenericTest("MACTest", (p:Parameters) => new MAC(),
  (c:MAC) => new MACTester(c))
+
class PipeAdderTester(c: PipeAdder) extends PeekPokeTester(c) {
  // Each tuple is (a, b, expected y); the sum is registered, so it is
  // checked one step after the operands are driven.
  val vectors = Seq(
    (-1, 7, 6),
    (-2, 7, 5))
  for ((inA, inB, expY) <- vectors) {
    poke(c.io.a, inA)
    poke(c.io.b, inB)
    step(1)
    expect(c.io.y, expY)
  }
}
+
// Elaborates a PipeAdder and drives it with PipeAdderTester.
class PipeAdderTest extends GenericTest("PipeAdderTest", (p:Parameters) => new PipeAdder(),
  (c:PipeAdder) => new PipeAdderTester(c))
+
class AdderTester(c: Adder) extends PeekPokeTester(c) {
  // Each tuple is (a, b, expected y). Unlike PipeAdder, the result is
  // checked in the same cycle as the pokes (before step), i.e. this test
  // treats the adder output as combinational.
  val vectors = Seq(
    (-1, 7, 6),
    (-2, 7, 5))
  for ((inA, inB, expY) <- vectors) {
    poke(c.io.a, inA)
    poke(c.io.b, inB)
    expect(c.io.y, expY)
    step(1)
  }
}
+
// Elaborates an Adder and drives it with AdderTester.
class AdderTest extends GenericTest("AdderTest", (p:Parameters) => new Adder(),
  (c:Adder) => new AdderTester(c))
+
class DotProductTester(c: DotProduct) extends PeekPokeTester(c) {
  // Generalized from a hard-coded width of 16: derive the lane count from
  // the port itself and compute the expected value, instead of relying on
  // the magic constants +/-8 (which are only correct for 16 lanes).
  val lanes = c.io.a.size

  // Pattern 1: a(i) = +1 on even lanes, -1 on odd lanes; b(i) = i.
  for (i <- 0 until lanes) {
    poke(c.io.a(i), if (i % 2 == 0) 1 else -1)
    poke(c.io.b(i), i)
  }
  step(1)
  // Pattern 2: the sign pattern flipped; b unchanged.
  for (i <- 0 until lanes) {
    poke(c.io.a(i), if (i % 2 == 1) 1 else -1)
    poke(c.io.b(i), i)
  }
  step(1)
  // Expected dot product for pattern 1 (sum of +i for even i, -i for odd i).
  // The results appear with a latency of one extra step, so pattern 1's
  // value is observed here, and pattern 2's (the negation) one step later.
  val expected = (0 until lanes).map(i => if (i % 2 == 0) i else -i).sum
  expect(c.io.y, expected)
  step(1)
  expect(c.io.y, -expected)
}
+
// Elaborates a DotProduct and drives it with DotProductTester.
class DotProductTest extends GenericTest("DotProductTest", (p:Parameters) => new DotProduct(),
  (c:DotProduct) => new DotProductTester(c))
diff --git a/hardware/chisel/src/test/scala/unittest/Generic.scala b/hardware/chisel/src/test/scala/unittest/Generic.scala
old mode 100755
new mode 100644
diff --git a/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala b/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala
old mode 100755
new mode 100644
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
new file mode 100644
index 0000000..1e4f153
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import unittest.util._
+import vta.core._
+import vta.util.config._
+
+import scala.io._
+import scala.language.postfixOps
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
+
/** Drives TensorGemmPipelinedSplit from a JSON-described architectural state.
 *
 * The JSON file (a classpath resource) supplies the GEMM instruction fields
 * and the inp/wgt/uop/acc scratchpad contents, plus the expected final
 * accumulator contents ("acc_o"). The tester mocks the scratchpad read/write
 * ports, replays the instruction, and checks the written-back accumulator
 * against "acc_o".
 *
 * Bug fix: the default resource was "/x.json", which no longer exists after
 * the fixture was renamed to gemm_1uop_overflow_offset.json.
 */
class TensorGemmJsonTester(c: TensorGemmPipelinedSplit, fn : String = "/gemm_1uop_overflow_offset.json")
  extends PeekPokeTester(c) {

  // Parse the whole JSON file into a generic Map; typed access is done
  // below with asInstanceOf casts per section.
  val bufferedSource = Source.fromURL(getClass.getResource(fn))
  val mapper = new ObjectMapper() with ScalaObjectMapper
  mapper.registerModule(DefaultScalaModule)
  val archState = mapper.readValue[Map[String, Object]](bufferedSource.reader())
  bufferedSource.close()

  val inst = archState("inst").asInstanceOf[Map[String,String]]

  /** Builds a scratchpad image from a JSON section: a sequence of
   *  {"idx": hex, "vec": [hex, ...]} entries. Entries must appear in index
   *  order (asserted); values are hex-encoded BigInts.
   */
  def build_scratchpad(tag: String) : Array[Array[BigInt]] = {
    val arr = archState(tag).asInstanceOf[Seq[Map[String,Object]]]
    (
      for {(m,i) <- arr.zipWithIndex} yield {
        val idx = BigInt(m("idx").asInstanceOf[String], 16)
        assert(BigInt(i) == idx)
        val vec = m("vec").asInstanceOf[Seq[String]]
        (
          for {v <- vec} yield {
            BigInt(v, 16)
          }
        ).toArray
      }
    ).toArray
  }

  val inp_scratchpad = build_scratchpad("inp")
  val wgt_scratchpad = build_scratchpad("wgt")
  val uop_scratchpad = build_scratchpad("uop")
  val acc_scratchpad = build_scratchpad("acc_i")
  val acc_o_scratchpad = build_scratchpad("acc_o")

  poke(c.io.start, 0)

  // Instruction fields, all hex-encoded in the JSON.
  val dec_reset = BigInt(inst("reset"), 16)
  val uop_begin = BigInt(inst("uop_begin"), 16)
  val uop_end = BigInt(inst("uop_end"), 16)
  assert(uop_begin < uop_end)
  val lp_0 = BigInt(inst("lp_0"), 16)
  val lp_1 = BigInt(inst("lp_1"), 16)
  val acc_0 = BigInt(inst("acc_0"), 16)
  val inp_0 = BigInt(inst("inp_0"), 16)
  val wgt_0 = BigInt(inst("wgt_0"), 16)
  val acc_1 = BigInt(inst("acc_1"), 16)
  val inp_1 = BigInt(inst("inp_1"), 16)
  val wgt_1 = BigInt(inst("wgt_1"), 16)

  poke(c.io.dec.reset, dec_reset)

  poke(c.io.dec.uop_begin, uop_begin)
  poke(c.io.dec.uop_end, uop_end)
  poke(c.io.dec.lp_0, lp_0)
  poke(c.io.dec.lp_1, lp_1)
  poke(c.io.dec.acc_0, acc_0)
  poke(c.io.dec.acc_1, acc_1)
  poke(c.io.dec.inp_0, inp_0)
  poke(c.io.dec.inp_1, inp_1)
  poke(c.io.dec.wgt_0, wgt_0)
  poke(c.io.dec.wgt_1, wgt_1)
  // Don't need empty_0,{push,pop}_{next,prev},op

  /** Mocks a tensor scratchpad read port: answers an index request observed
   *  on one step with data (from `scratchpad`) on the next step, modeling a
   *  one-cycle read latency.
   */
  class TensorMasterMock(tm: TensorMaster, scratchpad : Array[Array[BigInt]]) {
    poke(tm.rd(0).data.valid, 0)
    var valid = peek(tm.rd(0).idx.valid)
    var idx : Int = 0
    def logical_step() {
      if (valid == 1) {
        poke(tm.rd(0).data.valid, 1)
        val cols = tm.rd(0).data.bits(0).size
        for {i <- 0 until tm.rd(0).data.bits.size
          j <- 0 until cols
        } {
          poke(tm.rd(0).data.bits(i)(j), scratchpad(idx)(i*cols + j))
        }
      } else {
        poke(tm.rd(0).data.valid, 0)
      }
      valid = peek(tm.rd(0).idx.valid)
      idx = peek(tm.rd(0).idx.bits).toInt
    }
  }

  /** Mocks a tensor scratchpad write port: captures any valid write into
   *  `scratchpad` so the final contents can be compared against "acc_o".
   */
  class TensorMasterMockWr(tm: TensorMaster, scratchpad : Array[Array[BigInt]]) {
    def logical_step() {
      if (peek(tm.wr(0).valid) == 1) {
        val idx = peek(tm.wr(0).bits.idx).toInt
        val cols = tm.wr(0).bits.data(0).size
        for {
          i <- 0 until tm.wr(0).bits.data.size
          j <- 0 until cols
        } {
          scratchpad(idx)(i*cols + j) = peek(tm.wr(0).bits.data(i)(j))
        }
      }
    }
  }

  /** Mocks the micro-op memory read port with the same one-cycle latency
   *  scheme as TensorMasterMock; a uop is the (u0, u1, u2) index triple.
   */
  class UopMasterMock(um: UopMaster, scratchpad: Array[Array[BigInt]]) {
    poke(um.data.valid, 0)
    var valid = peek(um.idx.valid)
    var idx : Int = 0
    def logical_step() {
      if (valid == 1) {
        poke(um.data.valid, 1)
        poke(um.data.bits.u0, scratchpad(idx)(0))
        poke(um.data.bits.u1, scratchpad(idx)(1))
        poke(um.data.bits.u2, scratchpad(idx)(2))
      } else {
        poke(um.data.valid, 0)
      }
      valid = peek(um.idx.valid)
      idx = peek(um.idx.bits).toInt
    }
  }

  /** Bundles all port mocks plus queues of expected request/write indices
   *  (filled below from the instruction's loop nest).
   */
  class Mocks {
    val uop_mock = new UopMasterMock(c.io.uop, uop_scratchpad)
    val inp_mock = new TensorMasterMock(c.io.inp, inp_scratchpad)
    val wgt_mock = new TensorMasterMock(c.io.wgt, wgt_scratchpad)
    val acc_mock = new TensorMasterMock(c.io.acc, acc_scratchpad)
    val acc_mock_wr = new TensorMasterMockWr(c.io.acc, acc_scratchpad)

    val uop_indices = new scala.collection.mutable.Queue[BigInt]
    val acc_indices = new scala.collection.mutable.Queue[BigInt]
    val inp_indices = new scala.collection.mutable.Queue[BigInt]
    val wgt_indices = new scala.collection.mutable.Queue[BigInt]
    val accout_indices = new scala.collection.mutable.Queue[BigInt]
    val out_indices = new scala.collection.mutable.Queue[BigInt]

    // Advances one clock, services every mock, and checks any index the DUT
    // presents this cycle against the head of the corresponding queue.
    def logical_step() {
      step(1)
      uop_mock.logical_step()
      inp_mock.logical_step()
      wgt_mock.logical_step()
      acc_mock.logical_step()
      acc_mock_wr.logical_step()

      if (peek(c.io.uop.idx.valid) == 1) {
        expect(c.io.uop.idx.bits, uop_indices.dequeue())
      }
      if (peek(c.io.acc.rd(0).idx.valid) == 1) {
        expect(c.io.acc.rd(0).idx.bits, acc_indices.dequeue())
      }
      if (peek(c.io.inp.rd(0).idx.valid) == 1) {
        expect(c.io.inp.rd(0).idx.bits, inp_indices.dequeue())
      }
      if (peek(c.io.wgt.rd(0).idx.valid) == 1) {
        expect(c.io.wgt.rd(0).idx.bits, wgt_indices.dequeue())
      }
      if (peek(c.io.acc.wr(0).valid) == 1) {
        expect(c.io.acc.wr(0).bits.idx, accout_indices.dequeue())
      }
      if (peek(c.io.out.wr(0).valid) == 1) {
        expect(c.io.out.wr(0).bits.idx, out_indices.dequeue())
      }
    }

    // Diagnostic only: prints remaining queue sizes (does not assert).
    def test_if_done() {
      println(s"uop_indices should be empty ${uop_indices.size}")
      println(s"acc_indices should be empty ${acc_indices.size}")
      println(s"inp_indices should be empty ${inp_indices.size}")
      println(s"wgt_indices should be empty ${wgt_indices.size}")
      println(s"accout_indices should be empty ${accout_indices.size}")
      println(s"out_indices should be empty ${out_indices.size}")
    }

    // Compares the (write-updated) acc scratchpad against the expected
    // "acc_o" image, logging every mismatching element; true iff all match.
    def check() = {
      val result = for {
        ((x,y),idx) <- (acc_scratchpad, acc_o_scratchpad).zipped.toList.zipWithIndex
      } yield {
        (for {((xx,yy),jdx) <- (x,y).zipped.toList.zipWithIndex} yield {
          if (xx != yy) {
            println(s"Value mismatch at $idx $jdx: $xx (actual) != $yy (expected)")
          }
          xx == yy
        }).reduce((x,y) => x&&y)
      }
      val result2 = result.reduce((x,y) => x&&y)
      result2
    }
  }

  val mocks = new Mocks

  // Enumerate the instruction's loop nest in issue order and precompute the
  // index streams the DUT must request/write.
  for {
    cnt_o <- BigInt(0) until lp_0
    cnt_i <- BigInt(0) until lp_1
    uop_idx <- uop_begin until uop_end
  } {
    val u0 = uop_scratchpad(uop_idx.toInt)(0)
    val u1 = uop_scratchpad(uop_idx.toInt)(1)
    val u2 = uop_scratchpad(uop_idx.toInt)(2)

    mocks.uop_indices.enqueue(uop_idx)
    mocks.acc_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
    mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
    mocks.accout_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)

    // During a reset instruction nothing is written to the out scratchpad.
    if (dec_reset == 0) {
      mocks.out_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    }
  }

  poke(c.io.start, 0)
  mocks.logical_step()
  expect(c.io.state, c.sIdle)
  poke(c.io.start, 1)

  val total_steps = (uop_end-uop_begin)*lp_0*lp_1

  // Run until done, with a generous timeout so a hung DUT cannot stall CI.
  val max_count = 100 + 4*total_steps
  var count = 0
  while (peek(c.io.done) == 0 && count < max_count) {
    if (count % 100 == 0) {
      println(s"logical_step $count")
    }
    mocks.logical_step()
    if (count == 0) {
      poke(c.io.start, 0)
    }
    count += 1
  }

  assert(peek(c.io.done) == 1, s"Signal done never high even after $count steps.")
  println(s"Signal done high after $count steps.")

  // done must be a single-cycle pulse.
  mocks.logical_step()
  expect(c.io.done, 0)

  val cc = mocks.check()
  println(s"Checking acc with acc_o ${cc}")
  assert(cc)

  println(s"Total active steps: ${total_steps}")
  mocks.test_if_done()
}
+
// JSON-driven GEMM test: single uop, overflow, non-zero offsets.
class TensorGemmJsonTestSingleUopOverflowOffset extends GenericTest("TensorGemmJson", (p:Parameters) =>
  new TensorGemmPipelinedSplit()(p),
  (c:TensorGemmPipelinedSplit) => new TensorGemmJsonTester(c, "/gemm_1uop_overflow_offset.json"))
+
// JSON-driven GEMM test: two cascaded uops with overflow.
class TensorGemmJsonTestDoubleUopOverflowCascaded extends GenericTest("TensorGemmJson", (p:Parameters) =>
  new TensorGemmPipelinedSplit()(p),
  (c:TensorGemmPipelinedSplit) => new TensorGemmJsonTester(c, "/gemm_2uop_overflow_cascaded.json"))
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
new file mode 100644
index 0000000..6b2234c
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
@@ -0,0 +1,742 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import unittest.util._
+import vta.core._
+import vta.util.config._
+
/** Cycle-by-cycle walkthrough of the TensorGemmSimple FSM for a minimal
 *  instruction (one uop, 1x1 loop nest): checks the exact state sequence
 *  sIdle -> sReadUop -> sComputeIdx -> sReadTensor -> sExe -> sWait -> sIdle
 *  and the timing of the acc/out write pulses. The statement order below IS
 *  the test; do not reorder.
 */
class TensorGemmTester(c: TensorGemmSimple) extends PeekPokeTester(c) {
  poke(c.io.start, 0)
  poke(c.io.dec.reset, 0)
  poke(c.io.dec.uop_begin, 0)
  poke(c.io.dec.uop_end, 1)
  poke(c.io.dec.lp_0, 1)
  poke(c.io.dec.lp_1, 1)
  poke(c.io.dec.acc_0, 1)
  poke(c.io.dec.acc_1, 1)
  poke(c.io.dec.inp_0, 1)
  poke(c.io.dec.inp_1, 1)
  poke(c.io.dec.wgt_0, 1)
  poke(c.io.dec.wgt_1, 1)
  // Don't need empty_0, {push, pop}_{next, prev}, op

  poke(c.io.uop.data.bits.u0, 0)
  poke(c.io.uop.data.bits.u1, 0)
  poke(c.io.uop.data.bits.u2, 0)

  // Hold constant all-ones tensors on every read-data port; the mocks below
  // only toggle the valid handshakes.
  val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.inp.rd(0).data.bits} {
    poke(lhs, inp.reverse)
  }

  val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.wgt.rd(0).data.bits} {
    poke(lhs, wgt.reverse)
  }

  val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.acc.rd(0).data.bits} {
    poke(lhs, acc.reverse)
  }

  // Echoes last cycle's idx.valid back as data.valid (one-cycle read
  // latency) and asserts idx.valid matches the expected value v.
  class TensorMasterMock(tm: TensorMaster) {
    poke(tm.rd(0).data.valid, 0)
    var valid = peek(tm.rd(0).idx.valid)

    def logical_step(v: BigInt) {
      poke(tm.rd(0).data.valid, valid)
      valid = peek(tm.rd(0).idx.valid)
      expect(tm.rd(0).idx.valid, v)
    }
  }

  // Same handshake scheme for the uop port.
  class UopMasterMock(um: UopMaster) {
    poke(um.data.valid, 0)
    var valid = peek(um.idx.valid)

    def logical_step(v: BigInt) {
      poke(um.data.valid, valid)
      valid = peek(um.idx.valid)
      expect(um.idx.valid, v)
    }
  }

  class Mocks {
    val uop_mock = new UopMasterMock(c.io.uop)
    val inp_mock = new TensorMasterMock(c.io.inp)
    val wgt_mock = new TensorMasterMock(c.io.wgt)
    val acc_mock = new TensorMasterMock(c.io.acc)

    // One clock: sram_valid / uop_valid are the idx.valid values the DUT
    // is expected to drive this cycle.
    def logical_step(sram_valid: BigInt, uop_valid: BigInt) {
      step(1)
      uop_mock.logical_step(uop_valid)
      inp_mock.logical_step(sram_valid)
      wgt_mock.logical_step(sram_valid)
      acc_mock.logical_step(sram_valid)
    }
  }

  val mocks = new Mocks
  poke(c.io.start, 0)

  step(1)

  expect(c.io.state, c.sIdle)

  // Pulse start: the FSM leaves idle and requests the uop.
  poke(c.io.start, 1)
  mocks.logical_step(0, 1)
  expect(c.io.state, c.sReadUop)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  poke(c.io.start, 0)

  mocks.logical_step(0, 0)
  expect(c.io.state, c.sComputeIdx)
  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  // Tensor reads are requested during sReadTensor (sram_valid = 1).
  mocks.logical_step(1, 0)
  expect(c.io.state, c.sReadTensor)
  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  mocks.logical_step(0, 0)
  expect(c.io.state, c.sExe)
  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)
  expect(c.io.done, 0)

  // One operation is in flight while the FSM waits.
  mocks.logical_step(0, 0)
  expect(c.io.state, c.sWait)
  expect(c.io.inflight, 1)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  // Write-back: both out and acc write ports pulse for exactly one cycle.
  mocks.logical_step(0, 0)
  expect(c.io.state, c.sWait)
  expect(c.io.inflight, 1)

  expect(c.io.out.wr(0).valid, 1)
  expect(c.io.acc.wr(0).valid, 1)

  mocks.logical_step(0, 0)
  expect(c.io.state, c.sWait)
  expect(c.io.inflight, 0)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  // With nothing in flight the FSM returns to idle.
  mocks.logical_step(0, 0)
  expect(c.io.state, c.sIdle)
  expect(c.io.inflight, 0)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

}
+
// Elaborates a TensorGemmSimple and runs the FSM walkthrough test.
class TensorGemmTest extends GenericTest("TensorGemm", (p:Parameters) => new TensorGemmSimple()(p),
  (c:TensorGemmSimple) => new TensorGemmTester(c))
+
/** Checks the address streams of TensorGemmSimple over a full loop nest
 *  (2 uops x lp_0=2 x lp_1=3): every uop/acc/inp/wgt index the DUT presents
 *  must match the software-computed sequence
 *  u + stride_0*cnt_o + stride_1*cnt_i, in order. The per-iteration
 *  logical_step calls also pin the 4-cycle cadence of the simple FSM.
 */
class TensorGemmIdxTester(c: TensorGemmSimple) extends PeekPokeTester(c) {

  poke(c.io.start, 0)

  // Instruction parameters: strides are chosen distinct per operand so a
  // crossed-up index would be caught.
  val uop_begin = 0
  val uop_end = 2
  assert(uop_begin < uop_end)
  val lp_0 = 2
  val lp_1 = 3
  val acc_0 = 1*lp_1
  val inp_0 = 2*lp_1
  val wgt_0 = 4*lp_1
  val acc_1 = 1
  val inp_1 = 2
  val wgt_1 = 4
  // Distinct base offsets (0x000/0x100/0x200) per operand class.
  val u0 = BigInt("000", 16)
  val u1 = BigInt("100", 16)
  val u2 = BigInt("200", 16)

  poke(c.io.dec.reset, 0)
  poke(c.io.dec.uop_begin, uop_begin)
  poke(c.io.dec.uop_end, uop_end)
  poke(c.io.dec.lp_0, lp_0)
  poke(c.io.dec.lp_1, lp_1)
  poke(c.io.dec.acc_0, acc_0)
  poke(c.io.dec.acc_1, acc_1)
  poke(c.io.dec.inp_0, inp_0)
  poke(c.io.dec.inp_1, inp_1)
  poke(c.io.dec.wgt_0, wgt_0)
  poke(c.io.dec.wgt_1, wgt_1)
  // Don't need empty_0,{push,pop}_{next,prev},op

  poke(c.io.uop.data.bits.u0, u0)
  poke(c.io.uop.data.bits.u1, u1)
  poke(c.io.uop.data.bits.u2, u2)

  // Constant all-ones tensor data; only the handshakes are exercised here.
  val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.inp.rd(0).data.bits} {
    poke(lhs, inp.reverse)
  }

  val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.wgt.rd(0).data.bits} {
    poke(lhs, wgt.reverse)
  }

  val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.acc.rd(0).data.bits} {
    poke(lhs, acc.reverse)
  }

  // Echoes last cycle's idx.valid as data.valid (one-cycle read latency)
  // and asserts idx.valid equals the expected value v.
  class TensorMasterMock(tm: TensorMaster) {
    poke(tm.rd(0).data.valid, 0)
    var valid = peek(tm.rd(0).idx.valid)
    def logical_step(v: BigInt) {
      poke(tm.rd(0).data.valid, valid)
      valid = peek(tm.rd(0).idx.valid)
      expect(tm.rd(0).idx.valid, v)
    }
  }

  class UopMasterMock(um: UopMaster) {
    poke(um.data.valid, 0)
    var valid = peek(um.idx.valid)
    def logical_step(v: BigInt) {
      poke(um.data.valid, valid)
      valid = peek(um.idx.valid)
      expect(um.idx.valid, v)
    }
  }

  class Mocks {
    val uop_mock = new UopMasterMock(c.io.uop)
    val inp_mock = new TensorMasterMock(c.io.inp)
    val wgt_mock = new TensorMasterMock(c.io.wgt)
    val acc_mock = new TensorMasterMock(c.io.acc)

    // Expected index sequences, filled from the software loop nest below.
    val uop_indices = new scala.collection.mutable.Queue[BigInt]
    val acc_indices = new scala.collection.mutable.Queue[BigInt]
    val inp_indices = new scala.collection.mutable.Queue[BigInt]
    val wgt_indices = new scala.collection.mutable.Queue[BigInt]
    val accout_indices = new scala.collection.mutable.Queue[BigInt]
    val out_indices = new scala.collection.mutable.Queue[BigInt]

    def logical_step(sram_valid: BigInt, uop_valid: BigInt) {
      step(1)
      uop_mock.logical_step(uop_valid)
      inp_mock.logical_step(sram_valid)
      wgt_mock.logical_step(sram_valid)
      acc_mock.logical_step(sram_valid)
      if (peek(c.io.uop.idx.valid) == 1) {
        expect(c.io.uop.idx.bits, uop_indices.dequeue())
      }
      if (peek(c.io.acc.rd(0).idx.valid) == 1) {
        expect(c.io.acc.rd(0).idx.bits, acc_indices.dequeue())
      }
      if (peek(c.io.inp.rd(0).idx.valid) == 1) {
        expect(c.io.inp.rd(0).idx.bits, inp_indices.dequeue())
      }
      if (peek(c.io.wgt.rd(0).idx.valid) == 1) {
        expect(c.io.wgt.rd(0).idx.bits, wgt_indices.dequeue())
      }
      if (peek(c.io.acc.wr(0).valid) == 1) {
        expect(c.io.acc.wr(0).bits.idx, accout_indices.dequeue())
      }
      if (peek(c.io.out.wr(0).valid) == 1) {
        expect(c.io.out.wr(0).bits.idx, out_indices.dequeue())
      }
    }

    // NOTE(review): accout_indices/out_indices are intentionally(?) not
    // checked for emptiness here — only the read-side queues are.
    def test_if_done() {
      assert(uop_indices.isEmpty)
      assert(acc_indices.isEmpty)
      assert(inp_indices.isEmpty)
      assert(wgt_indices.isEmpty)
    }
  }

  // Software model of the loop nest, in the same iteration order the DUT
  // is expected to issue: outer lp_0, inner lp_1, innermost uop range.
  val mocks = new Mocks
  for {
    cnt_o <- 0 until lp_0
    cnt_i <- 0 until lp_1
    uop_idx <- uop_begin until uop_end
  } {
    mocks.uop_indices.enqueue(uop_idx)
    mocks.acc_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
    mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
    mocks.accout_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    mocks.out_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
  }

  poke(c.io.start, 0)
  step(1)
  expect(c.io.state, c.sIdle)

  poke(c.io.start, 1)

  // Each loop-nest iteration takes 4 logical steps in the simple FSM; the
  // write pulse of iteration q-1 is observed during iteration q.
  for {q <- 0 until (uop_end-uop_begin)*lp_0*lp_1} {
    mocks.logical_step(0, 1)
    expect(c.io.out.wr(0).valid, 0)
    expect(c.io.acc.wr(0).valid, 0)

    poke(c.io.start, 0)

    mocks.logical_step(0, 0)
    expect(c.io.out.wr(0).valid, if (q > 0) 1 else 0)
    expect(c.io.acc.wr(0).valid, if (q > 0) 1 else 0)

    mocks.logical_step(1, 0)
    expect(c.io.out.wr(0).valid, 0)
    expect(c.io.acc.wr(0).valid, 0)

    mocks.logical_step(0, 0)
    expect(c.io.out.wr(0).valid, 0)
    expect(c.io.acc.wr(0).valid, 0)
    expect(c.io.done, 0)
  }

  // Drain: the final iteration's result is still in flight and writes back
  // two steps later, as a single-cycle pulse.
  mocks.logical_step(0, 0)
  expect(c.io.inflight, 1)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  mocks.logical_step(0, 0)
  expect(c.io.inflight, 1)

  expect(c.io.out.wr(0).valid, 1)
  expect(c.io.acc.wr(0).valid, 1)

  mocks.logical_step(0, 0)
  expect(c.io.inflight, 0)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  mocks.logical_step(0, 0)
  expect(c.io.inflight, 0)

  expect(c.io.out.wr(0).valid, 0)
  expect(c.io.acc.wr(0).valid, 0)

  mocks.test_if_done()
}
+
// Elaborates a TensorGemmSimple and runs the index-stream test.
class TensorGemmIdxTest extends GenericTest("TensorGemmIdx", (p:Parameters) => new TensorGemmSimple()(p),
  (c:TensorGemmSimple) => new TensorGemmIdxTester(c))
+
/** Tests TensorGemmIndexGenerator in isolation: for a 2-uop, 2x3 loop nest
 *  it must emit, whenever io.valid is high, the uop index and the three
 *  operand offsets stride_0*cnt_o + stride_1*cnt_i in loop-nest order
 *  (note: no uop base offsets here — the generator adds only the strides).
 */
class TensorGemmIndexGeneratorTester(c: TensorGemmIndexGenerator) extends PeekPokeTester(c) {
  val uop_begin = 0
  val uop_end = 2
  assert(uop_begin < uop_end)
  val lp_0 = 2
  val lp_1 = 3
  // Strides chosen distinct per operand so a swapped stream would mismatch.
  val acc_0 = 1*lp_1
  val inp_0 = 2*lp_1
  val wgt_0 = 4*lp_1
  val acc_1 = 1
  val inp_1 = 2
  val wgt_1 = 4

  poke(c.io.dec.reset, 0)
  poke(c.io.dec.uop_begin, uop_begin)
  poke(c.io.dec.uop_end, uop_end)
  poke(c.io.dec.lp_0, lp_0)
  poke(c.io.dec.lp_1, lp_1)
  poke(c.io.dec.acc_0, acc_0)
  poke(c.io.dec.acc_1, acc_1)
  poke(c.io.dec.inp_0, inp_0)
  poke(c.io.dec.inp_1, inp_1)
  poke(c.io.dec.wgt_0, wgt_0)
  poke(c.io.dec.wgt_1, wgt_1)
  // Don't need empty_0,{push,pop}_{next,prev},op

  class Mocks {
    // Expected output sequences, filled from the software loop nest below.
    val uop_indices = new scala.collection.mutable.Queue[BigInt]
    val acc_indices = new scala.collection.mutable.Queue[BigInt]
    val inp_indices = new scala.collection.mutable.Queue[BigInt]
    val wgt_indices = new scala.collection.mutable.Queue[BigInt]

    // One clock; whenever the generator flags valid, all four outputs must
    // match the heads of the expected queues.
    def logical_step() {
      step(1)
      if (peek(c.io.valid) == 1) {
        expect(c.io.uop_idx, uop_indices.dequeue())
        expect(c.io.acc_i, acc_indices.dequeue())
        expect(c.io.inp_i, inp_indices.dequeue())
        expect(c.io.wgt_i, wgt_indices.dequeue())
      }
    }

    // All queues must be drained, i.e. every expected index was emitted.
    def test_if_done() {
      println(s"uop_indices remaining: ${uop_indices.size}")
      println(s"acc_indices remaining: ${acc_indices.size}")
      println(s"inp_indices remaining: ${inp_indices.size}")
      println(s"wgt_indices remaining: ${wgt_indices.size}")
      assert(uop_indices.isEmpty)
      assert(acc_indices.isEmpty)
      assert(inp_indices.isEmpty)
      assert(wgt_indices.isEmpty)
    }
  }

  val mocks = new Mocks
  for {
    cnt_o <- 0 until lp_0
    cnt_i <- 0 until lp_1
    uop_idx <- uop_begin until uop_end
  } {
    mocks.uop_indices.enqueue(uop_idx)
    mocks.acc_indices.enqueue(acc_0*cnt_o + acc_1*cnt_i)
    mocks.inp_indices.enqueue(inp_0*cnt_o + inp_1*cnt_i)
    mocks.wgt_indices.enqueue(wgt_0*cnt_o + wgt_1*cnt_i)
  }

  // Pulse start for one cycle, then run until `last` with a step bound so a
  // broken generator cannot hang the test.
  poke(c.io.start, 1)
  mocks.logical_step()
  poke(c.io.start, 0)

  val end = (uop_end-uop_begin)*lp_0*lp_1
  var count = 0
  while(peek(c.io.last) == 0 && count < 10*end + 100) {
    mocks.logical_step()
    count += 1
  }
  mocks.test_if_done()
}
+
// Elaborates a TensorGemmIndexGenerator and runs its stream test.
class TensorGemmIndexGeneratorTest extends GenericTest("TensorGemmIndexGenerator",
  (p:Parameters) => new TensorGemmIndexGenerator()(p),
  (c:TensorGemmIndexGenerator) => new TensorGemmIndexGeneratorTester(c))
+
/** Checks the index streams of the pipelined GEMM over a 2-uop, 2x3 loop
 *  nest: every uop/acc/inp/wgt/write index presented by the DUT must match
 *  the software-computed order, and the run must finish with done asserted.
 *
 *  Bug fix: the run loop's `count` was never incremented, so the
 *  `count < 10*end + 100` timeout bound was dead code and a DUT that never
 *  asserts done would hang the test forever.
 */
class TensorGemmPipelinedTester(c: TensorGemmPipelinedSplit) extends PeekPokeTester(c) {
  poke(c.io.start, 0)

  // Instruction parameters; strides are distinct per operand class so a
  // crossed-up stream would be caught.
  val uop_begin = 0
  val uop_end = 2
  assert(uop_begin < uop_end)
  val lp_0 = 2
  val lp_1 = 3
  val acc_0 = 1*lp_1
  val inp_0 = 2*lp_1
  val wgt_0 = 4*lp_1
  val acc_1 = 1
  val inp_1 = 2
  val wgt_1 = 4
  // Distinct base offsets (0x000/0x100/0x200) per operand class.
  val u0 = BigInt("000", 16)
  val u1 = BigInt("100", 16)
  val u2 = BigInt("200", 16)

  poke(c.io.dec.reset, 0)
  poke(c.io.dec.uop_begin, uop_begin)
  poke(c.io.dec.uop_end, uop_end)
  poke(c.io.dec.lp_0, lp_0)
  poke(c.io.dec.lp_1, lp_1)
  poke(c.io.dec.acc_0, acc_0)
  poke(c.io.dec.acc_1, acc_1)
  poke(c.io.dec.inp_0, inp_0)
  poke(c.io.dec.inp_1, inp_1)
  poke(c.io.dec.wgt_0, wgt_0)
  poke(c.io.dec.wgt_1, wgt_1)
  // Don't need empty_0,{push,pop}_{next,prev},op

  poke(c.io.uop.data.bits.u0, u0)
  poke(c.io.uop.data.bits.u1, u1)
  poke(c.io.uop.data.bits.u2, u2)

  // Constant all-ones tensor data; only the handshakes/indices are checked.
  val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.inp.rd(0).data.bits} {
    poke(lhs, inp.reverse)
  }

  val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.wgt.rd(0).data.bits} {
    poke(lhs, wgt.reverse)
  }

  val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
  for {lhs <- c.io.acc.rd(0).data.bits} {
    poke(lhs, acc.reverse)
  }

  // Echoes last cycle's idx.valid as data.valid (one-cycle read latency);
  // optionally asserts idx.valid equals v when a value is supplied.
  class TensorMasterMock(tm: TensorMaster) {
    poke(tm.rd(0).data.valid, 0)
    var valid = peek(tm.rd(0).idx.valid)
    def logical_step(v: Option[BigInt]) {
      poke(tm.rd(0).data.valid, valid)
      valid = peek(tm.rd(0).idx.valid)
      for {x <- v} expect(tm.rd(0).idx.valid, x)
    }
  }

  class UopMasterMock(um: UopMaster) {
    poke(um.data.valid, 0)
    var valid = peek(um.idx.valid)
    def logical_step(v: Option[BigInt]) {
      poke(um.data.valid, valid)
      valid = peek(um.idx.valid)
      for {x <- v} expect(um.idx.valid, x)
    }
  }

  class Mocks {
    val uop_mock = new UopMasterMock(c.io.uop)
    val inp_mock = new TensorMasterMock(c.io.inp)
    val wgt_mock = new TensorMasterMock(c.io.wgt)
    val acc_mock = new TensorMasterMock(c.io.acc)

    // Expected index sequences, filled from the software loop nest below.
    val uop_indices = new scala.collection.mutable.Queue[BigInt]
    val acc_indices = new scala.collection.mutable.Queue[BigInt]
    val inp_indices = new scala.collection.mutable.Queue[BigInt]
    val wgt_indices = new scala.collection.mutable.Queue[BigInt]
    val accout_indices = new scala.collection.mutable.Queue[BigInt]
    val out_indices = new scala.collection.mutable.Queue[BigInt]

    // One clock: service all mocks, then compare any index the DUT presents
    // this cycle against the head of the corresponding expected queue.
    def logical_step() {
      step(1)
      uop_mock.logical_step(None)
      inp_mock.logical_step(None)
      wgt_mock.logical_step(None)
      acc_mock.logical_step(None)
      if (peek(c.io.uop.idx.valid) == 1) {
        expect(c.io.uop.idx.bits, uop_indices.dequeue())
      }
      if (peek(c.io.acc.rd(0).idx.valid) == 1) {
        expect(c.io.acc.rd(0).idx.bits, acc_indices.dequeue())
      }
      if (peek(c.io.inp.rd(0).idx.valid) == 1) {
        expect(c.io.inp.rd(0).idx.bits, inp_indices.dequeue())
      }
      if (peek(c.io.wgt.rd(0).idx.valid) == 1) {
        expect(c.io.wgt.rd(0).idx.bits, wgt_indices.dequeue())
      }
      if (peek(c.io.acc.wr(0).valid) == 1) {
        expect(c.io.acc.wr(0).bits.idx, accout_indices.dequeue())
      }
      if (peek(c.io.out.wr(0).valid) == 1) {
        expect(c.io.out.wr(0).bits.idx, out_indices.dequeue())
      }
    }

    // Diagnostic only: prints remaining queue sizes (does not assert).
    def test_if_done() {
      println(s"uop_indices remaining: ${uop_indices.size}")
      println(s"acc_indices remaining: ${acc_indices.size}")
      println(s"inp_indices remaining: ${inp_indices.size}")
      println(s"wgt_indices remaining: ${wgt_indices.size}")
      println(s"accout_indices remaining: ${accout_indices.size}")
      println(s"out_indices remaining: ${out_indices.size}")
    }
  }

  // Software model of the loop nest, in issue order.
  val mocks = new Mocks
  for {
    cnt_o <- 0 until lp_0
    cnt_i <- 0 until lp_1
    uop_idx <- uop_begin until uop_end
  } {
    mocks.uop_indices.enqueue(uop_idx)
    mocks.acc_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
    mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
    mocks.accout_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
    mocks.out_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
  }

  poke(c.io.start, 0)
  step(1)
  expect(c.io.state, c.sIdle)
  poke(c.io.start, 1)

  var count = 0
  val end = (uop_end-uop_begin)*lp_0*lp_1

  // Run until done; the bound keeps a hung DUT from spinning forever.
  while (peek(c.io.done) == 0 && count < 10*end + 100) {
    mocks.logical_step()
    poke(c.io.start, 0)
    count += 1  // was missing: without this the timeout bound never triggers
  }

  expect(c.io.done, 1)
  mocks.test_if_done()
}
+
// Elaborates a TensorGemmPipelinedSplit and runs its index-stream test.
class TensorGemmPipelinedTest extends GenericTest("TensorGemmPipelined",
  (p:Parameters) => new TensorGemmPipelinedSplit()(p),
  (c:TensorGemmPipelinedSplit) => new TensorGemmPipelinedTester(c))
+
+// Drives TensorGemm through a full uop/loop nest with dec.reset = 1 and
+// checks every index the DUT presents against golden queues. With reset
+// asserted, no out-store indices are expected (see the dec_reset guard
+// where the golden queues are filled).
+class TensorGemmResetTester(c: TensorGemm) extends PeekPokeTester(c) {
+  poke(c.io.start, 0)
+
+  // Loop-nest geometry: uops [uop_begin, uop_end), outer loop lp_0, inner lp_1.
+  val uop_begin = 0
+  val uop_end = 2
+  assert(uop_begin < uop_end)
+  val lp_0 = 2
+  val lp_1 = 3
+  // Outer/inner index strides for the acc/inp/wgt address generators.
+  val acc_0 = 1*lp_1
+  val inp_0 = 2*lp_1
+  val wgt_0 = 4*lp_1
+  val acc_1 = 1
+  val inp_1 = 2
+  val wgt_1 = 4
+  // Base addresses carried in the single uop (acc, inp, wgt respectively).
+  val u0 = BigInt("000", 16)
+  val u1 = BigInt("100", 16)
+  val u2 = BigInt("200", 16)
+  val dec_reset = 1
+
+  poke(c.io.dec.reset, dec_reset)
+  poke(c.io.dec.uop_begin, uop_begin)
+  poke(c.io.dec.uop_end, uop_end)
+  poke(c.io.dec.lp_0, lp_0)
+  poke(c.io.dec.lp_1, lp_1)
+  poke(c.io.dec.acc_0, acc_0)
+  poke(c.io.dec.acc_1, acc_1)
+  poke(c.io.dec.inp_0, inp_0)
+  poke(c.io.dec.inp_1, inp_1)
+  poke(c.io.dec.wgt_0, wgt_0)
+  poke(c.io.dec.wgt_1, wgt_1)
+  // Don't need empty_0,{push,pop}_{next,prev},op
+
+  poke(c.io.uop.data.bits.u0, u0)
+  poke(c.io.uop.data.bits.u1, u1)
+  poke(c.io.uop.data.bits.u2, u2)
+
+  // Back every SRAM read port with constant all-ones tensors so the DUT
+  // always sees valid data regardless of the index it presents.
+  val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
+  for {lhs <- c.io.inp.rd(0).data.bits} {
+    poke(lhs, inp.reverse)
+  }
+
+  val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
+  for {lhs <- c.io.wgt.rd(0).data.bits} {
+    poke(lhs, wgt.reverse)
+  }
+
+  val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
+  for {lhs <- c.io.acc.rd(0).data.bits} {
+    poke(lhs, acc.reverse)
+  }
+
+  // Models a one-cycle-latency tensor SRAM: data.valid echoes the idx.valid
+  // observed on the previous logical step.
+  class TensorMasterMock(tm: TensorMaster) {
+    poke(tm.rd(0).data.valid, 0)
+    var valid = peek(tm.rd(0).idx.valid)
+    def logical_step(v: Option[BigInt]): Unit = {
+      poke(tm.rd(0).data.valid, valid)
+      valid = peek(tm.rd(0).idx.valid)
+      for {x <- v} expect(tm.rd(0).idx.valid, x)
+    }
+  }
+
+  // Same one-cycle-latency model for the uop memory port.
+  class UopMasterMock(um: UopMaster) {
+    poke(um.data.valid, 0)
+    var valid = peek(um.idx.valid)
+    def logical_step(v: Option[BigInt]): Unit = {
+      poke(um.data.valid, valid)
+      valid = peek(um.idx.valid)
+      for {x <- v} expect(um.idx.valid, x)
+    }
+  }
+
+  // Bundles all port mocks plus golden index queues; each logical_step
+  // advances one cycle and checks any index the DUT presents that cycle.
+  class Mocks {
+    val uop_mock = new UopMasterMock(c.io.uop)
+    val inp_mock = new TensorMasterMock(c.io.inp)
+    val wgt_mock = new TensorMasterMock(c.io.wgt)
+    val acc_mock = new TensorMasterMock(c.io.acc)
+
+    val uop_indices = new scala.collection.mutable.Queue[BigInt]
+    val acc_indices = new scala.collection.mutable.Queue[BigInt]
+    val inp_indices = new scala.collection.mutable.Queue[BigInt]
+    val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+    val accout_indices = new scala.collection.mutable.Queue[BigInt]
+    val out_indices = new scala.collection.mutable.Queue[BigInt]
+
+    // NOTE(review): sram_valid/uop_valid are currently unused; kept for
+    // signature compatibility with callers.
+    def logical_step(sram_valid: BigInt, uop_valid: BigInt): Unit = {
+      step(1)
+      uop_mock.logical_step(None)
+      inp_mock.logical_step(None)
+      wgt_mock.logical_step(None)
+      acc_mock.logical_step(None)
+      if (peek(c.io.uop.idx.valid) == 1) {
+        expect(c.io.uop.idx.bits, uop_indices.dequeue())
+      }
+      if (peek(c.io.acc.rd(0).idx.valid) == 1) {
+        expect(c.io.acc.rd(0).idx.bits, acc_indices.dequeue())
+      }
+      if (peek(c.io.inp.rd(0).idx.valid) == 1) {
+        expect(c.io.inp.rd(0).idx.bits, inp_indices.dequeue())
+      }
+      if (peek(c.io.wgt.rd(0).idx.valid) == 1) {
+        expect(c.io.wgt.rd(0).idx.bits, wgt_indices.dequeue())
+      }
+      if (peek(c.io.acc.wr(0).valid) == 1) {
+        expect(c.io.acc.wr(0).bits.idx, accout_indices.dequeue())
+      }
+      if (peek(c.io.out.wr(0).valid) == 1) {
+        expect(c.io.out.wr(0).bits.idx, out_indices.dequeue())
+      }
+    }
+
+    // All golden queues must be fully drained once the DUT reports done.
+    def test_if_done(): Unit = {
+      assert(uop_indices.isEmpty)
+      assert(acc_indices.isEmpty)
+      assert(inp_indices.isEmpty)
+      assert(wgt_indices.isEmpty)
+      assert(accout_indices.isEmpty)
+      assert(out_indices.isEmpty)
+    }
+  }
+
+  // Fill the golden queues in the DUT's nominal traversal order:
+  // outer loop, inner loop, then uop index.
+  val mocks = new Mocks
+  for {
+    cnt_o <- 0 until lp_0
+    cnt_i <- 0 until lp_1
+    uop_idx <- uop_begin until uop_end
+  } {
+    mocks.uop_indices.enqueue(uop_idx)
+    mocks.acc_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+    mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
+    mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
+    mocks.accout_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+
+    // Out-store writes only occur when not resetting the accumulator.
+    if (dec_reset == 0) {
+      mocks.out_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+    }
+  }
+
+  poke(c.io.start, 0)
+  step(1)
+  expect(c.io.state, c.sIdle)
+  poke(c.io.start, 1)
+
+  // Watchdog: bound the simulation so a wedged DUT fails the test instead
+  // of hanging the simulator (previously this loop had no bound at all).
+  // Bound mirrors TensorGemmPipelinedTester: 10 cycles per iteration + slack.
+  var count = 0
+  val total_iters = (uop_end-uop_begin)*lp_0*lp_1
+
+  while (peek(c.io.done) == 0 && count < 10*total_iters + 100) {
+    mocks.logical_step(0, 0)
+    poke(c.io.start, 0)
+    count += 1
+  }
+
+  // Fail explicitly on watchdog expiry rather than silently proceeding.
+  expect(c.io.done, 1)
+  mocks.test_if_done()
+}
+
+// Registers TensorGemmResetTester against TensorGemm
+// via the common GenericTest harness.
+class TensorGemmResetTest extends GenericTest(
+  "TensorGemmReset",
+  (p: Parameters) => new TensorGemm()(p),
+  (c: TensorGemm) => new TensorGemmResetTester(c))