Port to new Chisel stable release (3.5) (#37)

* upgrade to stable chisel 3.5.0

* fix Chisel 3.5 warnings

* port tests to chiseltest

* change chisel hardware makefile to use sbt test to run unittests
diff --git a/hardware/chisel/Makefile b/hardware/chisel/Makefile
index bbff447..2d43944 100644
--- a/hardware/chisel/Makefile
+++ b/hardware/chisel/Makefile
@@ -51,7 +51,6 @@
 USE_TRACE_DETAILED = 0
 USE_THREADS = 0
 VTA_LIBNAME = libvta_hw
-UNITTEST_NAME = all
 CXX = g++
 # A debug build with DEBUG = 1 is useful to trace the simulation with a
 # debugger.
@@ -194,7 +193,7 @@
 	sbt 'runMain vta.$(CONFIG_TEST) --target-dir $(chisel_build_dir) -o $(TOP_TEST).$(CONFIG)'
 
 unittest:
-	sbt 'test:runMain unittest.Launcher $(UNITTEST_NAME)'
+	sbt test
 
 clean:
 	-rm -rf target project/target project/project test_run_dir
diff --git a/hardware/chisel/build.sbt b/hardware/chisel/build.sbt
index 49c7951..a28e29d 100644
--- a/hardware/chisel/build.sbt
+++ b/hardware/chisel/build.sbt
@@ -21,21 +21,18 @@
 version := "0.1.0-SNAPSHOT"
 organization := "edu.washington.cs"
 
-scalaVersion := "2.12.13"
+scalaVersion := "2.12.15"
 scalacOptions ++= Seq(
-  "-Xsource:2.11",
   "-language:reflectiveCalls",
   "-deprecation",
   "-feature",
   "-Xcheckinit",
 )
 
-resolvers += Resolver.sonatypeRepo("snapshots")
-libraryDependencies += "edu.berkeley.cs" %% "chisel3" % "3.4.3"
-libraryDependencies += "edu.berkeley.cs" %% "chisel-iotesters" % "1.5.3"
+libraryDependencies += "edu.berkeley.cs" %% "chisel3" % "3.5.0"
+libraryDependencies += "edu.berkeley.cs" %% "chiseltest" % "0.5.0"
 
 libraryDependencies += "com.fasterxml.jackson.core" % "jackson-databind" % "2.10.3"
 libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.10.3"
 
-addCompilerPlugin("edu.berkeley.cs" % "chisel3-plugin" % "3.4.3" cross CrossVersion.full)
-addCompilerPlugin("org.scalamacros" % "paradise" % "2.1.1" cross CrossVersion.full)
+addCompilerPlugin("edu.berkeley.cs" % "chisel3-plugin" % "3.5.0" cross CrossVersion.full)
diff --git a/hardware/chisel/src/main/scala/core/FetchVME64.scala b/hardware/chisel/src/main/scala/core/FetchVME64.scala
index 2f51472..dda87b0 100644
--- a/hardware/chisel/src/main/scala/core/FetchVME64.scala
+++ b/hardware/chisel/src/main/scala/core/FetchVME64.scala
@@ -185,7 +185,7 @@
       printf("[Fetch] Launch\n")
     }
     // instruction
-    when(inst_q.io.deq.fire()) {
+    when(inst_q.io.deq.fire) {
       when(dec.io.isLoad) {
         printf("[Fetch] [instruction decode] [L] %x\n", inst_q.io.deq.bits)
       }
diff --git a/hardware/chisel/src/main/scala/core/FetchWideVME.scala b/hardware/chisel/src/main/scala/core/FetchWideVME.scala
index 1f55b91..711018b 100644
--- a/hardware/chisel/src/main/scala/core/FetchWideVME.scala
+++ b/hardware/chisel/src/main/scala/core/FetchWideVME.scala
@@ -74,7 +74,7 @@
   val xsize = io.ins_count << log2Ceil(elemsInInstr)
   // max size of transfer is limited by a buffer size
   val xmax = (((1 << mp.lenBits) << log2Ceil(tp.clSizeRatio)).min(tp.memDepth)).U
-  val elemNb = Reg(xsize.cloneType)
+  val elemNb = Reg(xsize)
 
   val sIdle :: sRead :: sDrain :: Nil = Enum(3)
   val state = RegInit(sIdle)
@@ -95,7 +95,7 @@
 
   io.vme_rd.data.ready := true.B
   val pipeDelayQueueDeqV = RegNext(io.vme_rd.data.valid, init = false.B)
-  val pipeDelayQueueDeqF = pipeDelayQueueDeqV // fire()
+  val pipeDelayQueueDeqF = pipeDelayQueueDeqV // fire
   val pipeDelayQueueDeqB = RegNext(io.vme_rd.data.bits)
 
   // Nb of CLs requestd, not received.
@@ -103,11 +103,11 @@
   val clInFlight = Reg(UInt(clCntIdxWdth.W))
   when(start) {
     clInFlight := 0.U
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && !pipeDelayQueueDeqF) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && !pipeDelayQueueDeqF) {
     clInFlight := clInFlight + readLen
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && pipeDelayQueueDeqF) {
     clInFlight := clInFlight + readLen - 1.U
-  }.elsewhen(isBusy && !io.vme_rd.cmd.fire() && pipeDelayQueueDeqF) {
+  }.elsewhen(isBusy && !io.vme_rd.cmd.fire && pipeDelayQueueDeqF) {
     assert(clInFlight > 0.U)
     clInFlight := clInFlight - 1.U
   }.otherwise {
@@ -184,7 +184,7 @@
     }
   }
   if (debug) {
-    when (io.vme_rd.data.fire()) {
+    when (io.vme_rd.data.fire) {
       printf(s"[TensorLoad] fetch data rdDataDestIdx:%x rdDataDestMask:%b\n",
         widx.asUInt,
         wmask.asUInt)
@@ -229,7 +229,7 @@
   dec.io.inst := readInstrPipe.io.deq.bits
   readInstrPipe.io.enq.valid := canRead
   readInstrPipe.io.enq.bits := rdata.asTypeOf(UInt(INST_BITS.W))
-  deqElem := readInstrPipe.io.enq.fire()
+  deqElem := readInstrPipe.io.enq.fire
   readInstrPipe.io.deq.ready := (
     (dec.io.isLoad & io.inst.ld.ready) ||
     (dec.io.isCompute & io.inst.co.ready) ||
@@ -293,13 +293,13 @@
     when(start) {
       printf("[Fetch] Launch\n")
     }
-    when(io.inst.ld.fire()) {
+    when(io.inst.ld.fire) {
       printf("[Fetch] [instruction decode] [L] %x\n", dec.io.inst)
     }
-    when(io.inst.co.fire()) {
+    when(io.inst.co.fire) {
       printf("[Fetch] [instruction decode] [C] %x\n", dec.io.inst)
     }
-    when(io.inst.st.fire()) {
+    when(io.inst.st.fire) {
       printf("[Fetch] [instruction decode] [S] %x\n", dec.io.inst)
     }
   }
@@ -339,7 +339,7 @@
   cmdGen.io.xpad_0 := 0.U
   cmdGen.io.xpad_1 := 0.U
   cmdGen.io.ypad_0 := 0.U
-  cmdGen.io.updateState := io.vmeCmd.fire()
+  cmdGen.io.updateState := io.vmeCmd.fire
   cmdGen.io.canSendCmd := true.B
 
   when(io.start) {
diff --git a/hardware/chisel/src/main/scala/core/LoadUop.scala b/hardware/chisel/src/main/scala/core/LoadUop.scala
index 74e08cd..a0ce49f 100644
--- a/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ b/hardware/chisel/src/main/scala/core/LoadUop.scala
@@ -34,7 +34,6 @@
   val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
   val idx = ValidIO(UInt(addrBits.W))
   val data = Flipped(ValidIO(new UopDecode))
-  override def cloneType = new UopMaster().asInstanceOf[this.type]
 }
 
 /** UopClient.
@@ -47,7 +46,6 @@
   val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
   val idx = Flipped(ValidIO(UInt(addrBits.W)))
   val data = ValidIO(new UopDecode)
-  override def cloneType = new UopClient().asInstanceOf[this.type]
 }
 
 /** LoadUopTop.
diff --git a/hardware/chisel/src/main/scala/core/LoadUopSimple.scala b/hardware/chisel/src/main/scala/core/LoadUopSimple.scala
index 7de7b1b..f60ed67 100644
--- a/hardware/chisel/src/main/scala/core/LoadUopSimple.scala
+++ b/hardware/chisel/src/main/scala/core/LoadUopSimple.scala
@@ -134,7 +134,7 @@
 
   when(state =/= sReadData) {
     xcnt := 0.U
-  }.elsewhen(io.vme_rd.data.fire()) {
+  }.elsewhen(io.vme_rd.data.fire) {
     xcnt := xcnt + 1.U
   }
 
@@ -155,7 +155,7 @@
         waddr(1) := so
       }
     }
-  }.elsewhen(io.vme_rd.data.fire()) {
+  }.elsewhen(io.vme_rd.data.fire) {
     for (i <- 0 until uopsPerMemXfer) {
       waddr(i) := waddr(i) + 1.U
     }
@@ -169,7 +169,7 @@
     wmask(i) := true.B
   }
 
-  when (io.vme_rd.data.fire()) {
+  when (io.vme_rd.data.fire) {
     when (first) {
       first := false.B
 
@@ -201,7 +201,7 @@
     }
   }
 
-  when(io.vme_rd.data.fire()) {
+  when(io.vme_rd.data.fire) {
     for { i <- 0 until mems.size} {
       when (wmask(i)) {
         mems(i).write(waddr(i), wdata(i))
@@ -209,7 +209,7 @@
     }
   }
 
-  io.done := io.vme_rd.data.fire() & last
+  io.done := io.vme_rd.data.fire & last
 
   // ----------- read-from-sram -------------
 
@@ -243,7 +243,7 @@
 
   // debug
   if (debug) {
-    when(io.vme_rd.cmd.fire()) {
+    when(io.vme_rd.cmd.fire) {
       printf("[LoadUop] cmd addr:%x len:%x rem:%x\n", raddr, xlen, xrem)
     }
   }
diff --git a/hardware/chisel/src/main/scala/core/TensorAlu.scala b/hardware/chisel/src/main/scala/core/TensorAlu.scala
index 8d7aa31..e7d67d1 100644
--- a/hardware/chisel/src/main/scala/core/TensorAlu.scala
+++ b/hardware/chisel/src/main/scala/core/TensorAlu.scala
@@ -286,13 +286,13 @@
 
   require(io.out.splitWidth == 1 && io.out.splitLength == 1, "-F- Out split write is not supported")
   val numVecUnits = dataSplitFactor
-  val outData = Wire(io.out.wr(0).bits.data.cloneType)
-  val dataRemapB = Wire(Vec(numVecUnits, io.acc.rd(0).data.bits.cloneType))
-  val dataRemapA = Wire(Vec(numVecUnits, io.acc.rd(0).data.bits.cloneType))
+  val outData = Wire(chiselTypeOf(io.out.wr(0).bits.data))
+  val dataRemapB = Wire(Vec(numVecUnits, chiselTypeOf(io.acc.rd(0).data.bits)))
+  val dataRemapA = Wire(Vec(numVecUnits, chiselTypeOf(io.acc.rd(0).data.bits)))
   // numVecUnits is a pow of 2
   // split dec bits pipe further if there are many vecUnits
   val decSplitNb0 =  if (numVecUnits < 8) 1 else 2
-  val decSplit0 = Wire(Vec(decSplitNb0, io.dec.cloneType))
+  val decSplit0 = Wire(Vec(decSplitNb0, chiselTypeOf(io.dec)))
   for (idx <- 0 until decSplitNb0) {
     decSplit0(idx) := ShiftRegister(io.dec, if(aluDataReadPipeDelay < 2) 0 else 1)
   }
diff --git a/hardware/chisel/src/main/scala/core/TensorLoadNarrowVME.scala b/hardware/chisel/src/main/scala/core/TensorLoadNarrowVME.scala
index 746e54f..1a249e1 100644
--- a/hardware/chisel/src/main/scala/core/TensorLoadNarrowVME.scala
+++ b/hardware/chisel/src/main/scala/core/TensorLoadNarrowVME.scala
@@ -87,11 +87,11 @@
   val blocksInFlight = Reg(UInt(blkIdxWdth.W))
   when(io.start) {
     blocksInFlight := 0.U
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && !vmeDataFirePipe) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && !vmeDataFirePipe) {
     blocksInFlight := blocksInFlight + readLen
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && vmeDataFirePipe) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && vmeDataFirePipe) {
     blocksInFlight := blocksInFlight + readLen - 1.U
-  }.elsewhen(isBusy && !io.vme_rd.cmd.fire() && vmeDataFirePipe) {
+  }.elsewhen(isBusy && !io.vme_rd.cmd.fire && vmeDataFirePipe) {
     assert(blocksInFlight > 0.U)
     blocksInFlight := blocksInFlight - 1.U
   }.otherwise {
@@ -511,14 +511,14 @@
     init = false.B)
   when(io.start) {
     vmeTagDecodeLastValid :=false.B // reset tag valid
-  }.elsewhen(io.vmeData.fire()) {
+  }.elsewhen(io.vmeData.fire) {
     vmeTagDecodeLastValid := true.B // set tag valid on a new read
   }.otherwise {
     vmeTagDecodeLastValid := vmeTagDecodeLastValidNext // keep value
   }
   rdDataDestCol := DontCare
   rdDataDestIdx := DontCare
-  when(io.vmeData.fire()) {
+  when(io.vmeData.fire) {
     when (
       !vmeTagDecodeLastValidNext ||
       (vmeTagDecodeLastValidNext &&
@@ -607,7 +607,7 @@
   when (io.start || stride) {
     blocksReadNb := 0.U
     commandsDone := false.B
-  }.elsewhen (io.vmeCmd.fire()) {
+  }.elsewhen (io.vmeCmd.fire) {
     val nextBlRNb = blocksReadNb + readLen
     blocksReadNb := nextBlRNb // THIS IS WHEN A NEW VME CMD HAPPENS
     when (nextBlRNb === blocksReadSize && srcRowIdx === dec.ysize - 1.U) {
@@ -618,7 +618,7 @@
   }
 
   //when the whole xsize row read commands send, go for the next src row
-  when((blocksReadNb === blocksReadSize - readLen) && (srcRowIdx =/= dec.ysize - 1.U) && io.vmeCmd.fire()) {
+  when((blocksReadNb === blocksReadSize - readLen) && (srcRowIdx =/= dec.ysize - 1.U) && io.vmeCmd.fire) {
     stride := true.B
   }.otherwise {
     stride := false.B
@@ -667,7 +667,7 @@
     rdCmdExtAddr := xfer_init_addr
     rdCmdExtAddrRowBegin := xfer_init_addr
     newReadRow := true.B
-  }.elsewhen (io.vmeCmd.fire()) {
+  }.elsewhen (io.vmeCmd.fire) {
     when(stride) {
       val memRow = rdCmdExtAddrRowBegin + (dec.xstride << log2Ceil(elemBytes))
       rdCmdExtAddr := memRow //  go to the next source matrix row with xstride tensors offset
@@ -709,7 +709,7 @@
     when(startIssueCmdRead) {
       rdCmdDestBlockIdx := rdCmdStartIdx << blkOffset // it is aligned by tensor size
       rdCmdDestBlockIdxNext:= rdCmdDestBlockIdx + readLen
-    }.elsewhen (io.vmeCmd.fire()) {
+    }.elsewhen (io.vmeCmd.fire) {
       // increment block position by transaction length
       rdCmdDestBlockIdxNext:= rdCmdDestBlockIdxNext + readLen
     }
@@ -717,7 +717,7 @@
     rdCmdValid := false.B
   }
   if(debug) {
-    when (io.vmeCmd.fire()) {
+    when (io.vmeCmd.fire) {
       printf(s"[GenVMECmd] $tensorType cmd data rdCmdDestBlockIdx:%b " +
         s" length:%d \n",
         rdCmdDestBlockIdx,
diff --git a/hardware/chisel/src/main/scala/core/TensorLoadSimple.scala b/hardware/chisel/src/main/scala/core/TensorLoadSimple.scala
index c2ef007..eb2480d 100644
--- a/hardware/chisel/src/main/scala/core/TensorLoadSimple.scala
+++ b/hardware/chisel/src/main/scala/core/TensorLoadSimple.scala
@@ -156,13 +156,13 @@
   dataCtrl.io.start := state === sIdle & io.start
   dataCtrl.io.inst := io.inst
   dataCtrl.io.baddr := io.baddr
-  dataCtrl.io.xinit := io.vme_rd.cmd.fire()
-  dataCtrl.io.xupdate := io.vme_rd.data.fire()
-  dataCtrl.io.yupdate := io.vme_rd.data.fire()
+  dataCtrl.io.xinit := io.vme_rd.cmd.fire
+  dataCtrl.io.xupdate := io.vme_rd.data.fire
+  dataCtrl.io.yupdate := io.vme_rd.data.fire
 
   when(state === sIdle) {
     dataCtrlDone := false.B
-  }.elsewhen(io.vme_rd.data.fire() && dataCtrl.io.done) {
+  }.elsewhen(io.vme_rd.data.fire && dataCtrl.io.done) {
     dataCtrlDone := true.B
   }
 
@@ -170,16 +170,16 @@
   yPadCtrl0.io.start := dec.ypad_0 =/= 0.U & state === sIdle & io.start
 
   yPadCtrl1.io.start := dec.ypad_1 =/= 0.U &
-    ((io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U) |
+    ((io.vme_rd.data.fire & dataCtrl.io.done & dec.xpad_1 === 0.U) |
       (state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone))
 
   xPadCtrl0.io.start := dec.xpad_0 =/= 0.U &
     ((state === sIdle & io.start) |
       (state === sYPad0 & yPadCtrl0.io.done) |
-      (io.vme_rd.data.fire() & ~dataCtrlDone & dataCtrl.io.stride & dec.xpad_1 === 0.U) |
+      (io.vme_rd.data.fire & ~dataCtrlDone & dataCtrl.io.stride & dec.xpad_1 === 0.U) |
       (state === sXPad1 & xPadCtrl1.io.done & ~dataCtrlDone))
 
-  xPadCtrl1.io.start := dec.xpad_1 =/= 0.U & io.vme_rd.data.fire() &
+  xPadCtrl1.io.start := dec.xpad_1 =/= 0.U & io.vme_rd.data.fire &
     ((dataCtrl.io.done) | (~dataCtrl.io.done & dataCtrl.io.stride & dec.xpad_1 =/= 0.U))
 
   yPadCtrl0.io.inst := io.inst
@@ -205,14 +205,14 @@
     tag := tag
   }.elsewhen(state === sIdle || state === sReadCmd || tag === (tp.numMemBlock - 1).U) {
     tag := 0.U
-  }.elsewhen(io.vme_rd.data.fire() || isZeroPad) {
+  }.elsewhen(io.vme_rd.data.fire || isZeroPad) {
     tag := tag + 1.U
   }
 
   when(state === sIdle || (dataCtrlDone && ~isZeroPad) ||
     (set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U)) {
     set := 0.U
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
+  }.elsewhen((io.vme_rd.data.fire || isZeroPad) && tag === (tp.numMemBlock - 1).U) {
     set := set + 1.U
   }
 
@@ -221,12 +221,12 @@
   when(state === sIdle) {
     waddr_cur := dec.sram_offset
     waddr_nxt := dec.sram_offset
-  }.elsewhen((io.vme_rd.data.fire() || isZeroPad)
+  }.elsewhen((io.vme_rd.data.fire || isZeroPad)
     && set === (tp.tensorLength - 1).U
     && tag === (tp.numMemBlock - 1).U)
   {
     waddr_cur := waddr_cur + 1.U
-  }.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire()) {
+  }.elsewhen(dataCtrl.io.stride && io.vme_rd.data.fire) {
     waddr_cur := waddr_nxt + dec.xsize
     waddr_nxt := waddr_nxt + dec.xsize
   }
@@ -267,7 +267,7 @@
     val muxWen =
       Mux(state === sIdle,
         io.tensor.wr(0).valid,
-        (io.vme_rd.data.fire() | isZeroPad) & set === i.U)
+        (io.vme_rd.data.fire | isZeroPad) & set === i.U)
     val muxWaddr = Mux(state === sIdle, io.tensor.wr(0).bits.idx, waddr_cur)
     val muxWdata = Mux(state === sIdle, tdata, wdata(i))
     val muxWmask = Mux(state === sIdle, no_mask, wmask(i))
@@ -288,7 +288,7 @@
   }
 
   // done
-  val done_no_pad = io.vme_rd.data.fire() & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
+  val done_no_pad = io.vme_rd.data.fire & dataCtrl.io.done & dec.xpad_1 === 0.U & dec.ypad_1 === 0.U
   val done_x_pad = state === sXPad1 & xPadCtrl1.io.done & dataCtrlDone & dec.ypad_1 === 0.U
   val done_y_pad = state === sYPad1 & dataCtrlDone & yPadCtrl1.io.done
   io.done := done_no_pad | done_x_pad | done_y_pad
@@ -296,7 +296,7 @@
   // debug
   if (debug) {
     if (tensorType == "inp") {
-      when(io.vme_rd.cmd.fire()) {
+      when(io.vme_rd.cmd.fire) {
         printf("[TensorLoad] [inp] cmd addr:%x len:%x\n",
           dataCtrl.io.addr,
           dataCtrl.io.len)
@@ -314,7 +314,7 @@
         printf("[TensorLoad] [inp] sXPad1\n")
       }
     } else if (tensorType == "wgt") {
-      when(io.vme_rd.cmd.fire()) {
+      when(io.vme_rd.cmd.fire) {
         printf("[TensorLoad] [wgt] cmd addr:%x len:%x\n",
           dataCtrl.io.addr,
           dataCtrl.io.len)
@@ -332,7 +332,7 @@
         printf("[TensorLoad] [wgt] sXPad1\n")
       }
     } else if (tensorType == "acc") {
-      when(io.vme_rd.cmd.fire()) {
+      when(io.vme_rd.cmd.fire) {
         printf("[TensorLoad] [acc] cmd addr:%x len:%x\n",
           dataCtrl.io.addr,
           dataCtrl.io.len)
diff --git a/hardware/chisel/src/main/scala/core/TensorLoadWideVME.scala b/hardware/chisel/src/main/scala/core/TensorLoadWideVME.scala
index f50530a..3ad8259 100644
--- a/hardware/chisel/src/main/scala/core/TensorLoadWideVME.scala
+++ b/hardware/chisel/src/main/scala/core/TensorLoadWideVME.scala
@@ -132,11 +132,11 @@
   val clInFlight = Reg(UInt(clCntIdxWdth.W))
   when(io.start) {
     clInFlight := 0.U
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && !vmeDataFirePipe) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && !vmeDataFirePipe) {
     clInFlight := clInFlight + readLen
-  }.elsewhen(isBusy && io.vme_rd.cmd.fire() && vmeDataFirePipe) {
+  }.elsewhen(isBusy && io.vme_rd.cmd.fire && vmeDataFirePipe) {
     clInFlight := clInFlight + readLen - 1.U
-  }.elsewhen(isBusy && !io.vme_rd.cmd.fire() && vmeDataFirePipe) {
+  }.elsewhen(isBusy && !io.vme_rd.cmd.fire && vmeDataFirePipe) {
     assert(clInFlight > 0.U)
     clInFlight := clInFlight - 1.U
   }.otherwise {
@@ -392,7 +392,7 @@
     init = false.B)
   when(io.start) {
     vmeTagDecodeLastValid :=false.B // reset tag valid
-  }.elsewhen(io.vmeData.fire()) {
+  }.elsewhen(io.vmeData.fire) {
     vmeTagDecodeLastValid := true.B // set tag valid on a new read
   }.otherwise {
     vmeTagDecodeLastValid := vmeTagDecodeLastValidNext // keep value
@@ -411,10 +411,10 @@
           isLastPulse,
           wrMaskLast,
           ((1 << wmaskWidth) - 1).U)))
-  val wmask = Mux(io.vmeData.fire(), wmaskSel, 0.U)
+  val wmask = Mux(io.vmeData.fire, wmaskSel, 0.U)
   rdDataElemDestIdx := DontCare
   isFirstPulse := false.B
-  when(io.vmeData.fire()) {
+  when(io.vmeData.fire) {
     when (
       !vmeTagDecodeLastValidNext ||
       (vmeTagDecodeLastValidNext &&
@@ -760,6 +760,6 @@
   cmdGen.io.xpad_0 := dec.xpad_0
   cmdGen.io.xpad_1 := dec.xpad_1
   cmdGen.io.ypad_0 := dec.ypad_0
-  cmdGen.io.updateState := io.vmeCmd.fire()
+  cmdGen.io.updateState := io.vmeCmd.fire
   cmdGen.io.canSendCmd := true.B
 }
diff --git a/hardware/chisel/src/main/scala/core/TensorStoreNarrowVME.scala b/hardware/chisel/src/main/scala/core/TensorStoreNarrowVME.scala
index bbef36d..6ab698d 100644
--- a/hardware/chisel/src/main/scala/core/TensorStoreNarrowVME.scala
+++ b/hardware/chisel/src/main/scala/core/TensorStoreNarrowVME.scala
@@ -197,14 +197,14 @@
 
   when(state === sWriteCmd || tag === (numMemBlock - 1).U) {
     tag := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
+  }.elsewhen(io.vme_wr.data.fire) {
     tag := tag + 1.U
   }
 
   when(
     state === sWriteCmd || (state =/= sReadMem && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U)) {
     set := 0.U
-  }.elsewhen(io.vme_wr.data.fire() && tag === (numMemBlock - 1).U) {
+  }.elsewhen(io.vme_wr.data.fire && tag === (numMemBlock - 1).U) {
     set := set + 1.U
   }
 
@@ -213,7 +213,7 @@
   when(state === sIdle) {
     raddr_cur := dec.sram_offset
     raddr_nxt := dec.sram_offset
-  }.elsewhen(io.vme_wr.data.fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
+  }.elsewhen(io.vme_wr.data.fire && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
     raddr_cur := raddr_cur + 1.U
   }.elsewhen(stride) {
     raddr_cur := raddr_nxt + dec.xsize
@@ -248,7 +248,7 @@
 
   when(state === sWriteCmd) {
     xcnt := 0.U
-  }.elsewhen(io.vme_wr.data.fire()) {
+  }.elsewhen(io.vme_wr.data.fire) {
     xcnt := xcnt + 1.U
   }
 
@@ -260,11 +260,11 @@
 
   // debug
   if (debug) {
-    when(io.vme_wr.cmd.fire()) {
+    when(io.vme_wr.cmd.fire) {
       printf("[TensorStore] ysize:%x ycnt:%x raddr:%x waddr:%x len:%x rem:%x\n",
         ysize, ycnt, raddr_cur, waddr_cur, xlen, xrem)
     }
-    when(io.vme_wr.data.fire()) {
+    when(io.vme_wr.data.fire) {
       printf("[TensorStore] data:%x\n", io.vme_wr.data.bits.data)
       printf("[TensorStore] strb:%x\n", io.vme_wr.data.bits.strb)
     }
diff --git a/hardware/chisel/src/main/scala/core/TensorStoreWideVME.scala b/hardware/chisel/src/main/scala/core/TensorStoreWideVME.scala
index 8a3cea3..24b9618 100644
--- a/hardware/chisel/src/main/scala/core/TensorStoreWideVME.scala
+++ b/hardware/chisel/src/main/scala/core/TensorStoreWideVME.scala
@@ -114,14 +114,14 @@
       }
     }
     is(sWriteCmd) {
-      when(io.vme_wr.cmd.fire()) {
+      when(io.vme_wr.cmd.fire) {
         state := sWriteData
         updateState := true.B
         xcnt := 0.U
       }
     }
     is(sWriteData) {
-      when(io.vme_wr.data.fire()) {
+      when(io.vme_wr.data.fire) {
         when(xcnt === readLen - 1.U) {
           state := sWriteAck
         }.otherwise {
@@ -181,10 +181,10 @@
   //
   //  SRAM !-tz-.=TZ=.-TZ-!-TZ-.-TZ-.-tz-!
 
-  val isFirstPulse = io.vme_wr.data.fire() && xcnt === 0.U
+  val isFirstPulse = io.vme_wr.data.fire && xcnt === 0.U
   assert(state =/= sWriteData || readLen > 0.U)
   val firstPulseTenzorsNb = tp.clSizeRatio.U - fstPulseDataStart
-  val isLastPulse = io.vme_wr.data.fire() && xcnt === readLen - 1.U
+  val isLastPulse = io.vme_wr.data.fire && xcnt === readLen - 1.U
   val spReadAddrReg = Reg(UInt(M_SRAM_OFFSET_BITS.W))
   val spReadAddr = Wire(spReadAddrReg.cloneType)
   val srcElemOffsetReg = Reg(UInt(log2Ceil(tp.clSizeRatio).W))
@@ -197,7 +197,7 @@
     spReadAddrReg := spReadAddr + incrFstIdx
     srcElemOffset := spElemIdx % tp.clSizeRatio.U
     srcElemOffsetReg := (spElemIdx + firstPulseTenzorsNb) % tp.clSizeRatio.U
-  }.elsewhen(io.vme_wr.data.fire()) {
+  }.elsewhen(io.vme_wr.data.fire) {
     spReadAddrReg := spReadAddrReg + 1.U
     spReadAddr := spReadAddrReg
     srcElemOffset := (spElemIdx + firstPulseTenzorsNb) % tp.clSizeRatio.U
@@ -231,7 +231,7 @@
     srcData(i) := VecInit(for (grpIdx <- 0 until splitDataFactor) yield {
       tensorFile(i*splitDataFactor + grpIdx).read(
         srcMemIdx(i),
-        state === sWriteCmd | (state === sWriteData && io.vme_wr.data.fire()))
+        state === sWriteCmd | (state === sWriteData && io.vme_wr.data.fire))
     }).asTypeOf(UInt(tp.tensorSizeBits.W))
 
     // crossbar src to dst
@@ -278,7 +278,7 @@
 
   // debug
   if (debug) {
-    when(io.vme_wr.data.fire()) {
+    when(io.vme_wr.data.fire) {
       printf("[TensorStore] data:%x\n", io.vme_wr.data.bits.data)
       printf("[TensorStore] strb:%x\n", io.vme_wr.data.bits.strb)
     }
diff --git a/hardware/chisel/src/main/scala/core/TensorUtil.scala b/hardware/chisel/src/main/scala/core/TensorUtil.scala
index 6e79548..c19262a 100644
--- a/hardware/chisel/src/main/scala/core/TensorUtil.scala
+++ b/hardware/chisel/src/main/scala/core/TensorUtil.scala
@@ -283,8 +283,6 @@
       }
     }
   }
-  override def cloneType =
-    new TensorMaster(tensorType).asInstanceOf[this.type]
 }
 
 /** TensorClient.
@@ -325,8 +323,6 @@
       }
     }
   }
-  override def cloneType =
-    new TensorClient(tensorType).asInstanceOf[this.type]
 }
 
 /** TensorMasterData.
@@ -339,8 +335,6 @@
   (implicit p: Parameters) extends TensorParams(tensorType) {
   val data = Flipped(
     ValidIO(Vec(lenSplit, Vec(widthSplit, UInt(tensorElemBits.W)))))
-  override def cloneType =
-    new TensorMasterData(tensorType).asInstanceOf[this.type]
 }
 
 /** TensorClientData.
@@ -353,8 +347,6 @@
   (implicit p: Parameters) extends TensorParams(tensorType) {
   val data = ValidIO(
     Vec(lenSplit, Vec(widthSplit, UInt(tensorElemBits.W))))
-  override def cloneType =
-    new TensorClientData(tensorType).asInstanceOf[this.type]
 }
 
 /** TensorPadCtrl. Zero-padding controller for TensorLoad. */
diff --git a/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala b/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
index 5c7c5ee..b3815cd 100644
--- a/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
+++ b/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
@@ -153,10 +153,10 @@
     when(state === sReadAddress && io.axi.ar.ready) {
       printf("[VTAHostDPIToAXI] [AR] addr:%x\n", addr)
     }
-    when(io.axi.r.fire()) {
+    when(io.axi.r.fire) {
       printf("[VTAHostDPIToAXI] [R] value:%x\n", io.axi.r.bits.data)
     }
-    when(io.axi.w.fire()) {
+    when(io.axi.w.fire) {
       printf("[VTAHostDPIToAXI] [W] value:%x\n", io.axi.w.bits.data)
     }
   }
diff --git a/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala b/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
index 48a2e9a..d53ad1c 100644
--- a/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
+++ b/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
@@ -50,15 +50,11 @@
 class VTAMemDPIData(implicit val p: Parameters) extends Bundle {
   val data = UInt(p(ShellKey).memParams.dataBits.W)
   val id   = UInt(p(ShellKey).memParams.idBits.W)
-  override def cloneType =
-  new VTAMemDPIData().asInstanceOf[this.type]
 }
 
 class VTAMemDPIWrData(implicit val p: Parameters) extends Bundle {
   val data = UInt(p(ShellKey).memParams.dataBits.W)
   val strb = UInt((p(ShellKey).memParams.dataBits/8).W)
-  override def cloneType =
-  new VTAMemDPIWrData().asInstanceOf[this.type]
 }
 
 
diff --git a/hardware/chisel/src/main/scala/shell/SimShell.scala b/hardware/chisel/src/main/scala/shell/SimShell.scala
index 927f8af..5474654 100644
--- a/hardware/chisel/src/main/scala/shell/SimShell.scala
+++ b/hardware/chisel/src/main/scala/shell/SimShell.scala
@@ -68,7 +68,7 @@
  * the simulation thread when it is asserted and resume it when it is
  * de-asserted.
  */
-class VTASim(implicit p: Parameters) extends MultiIOModule {
+class VTASim(implicit p: Parameters) extends Module {
   val sim_wait = IO(Output(Bool()))
   val sim = Module(new VTASimDPI)
   sim.io.reset := reset
@@ -82,7 +82,7 @@
  * are connected to the VTAShell. An extra clock, sim_clock, is used to eval
  * the VTASim DPI function when the main simulation clock is on halt state.
  */
-class SimShell(implicit p: Parameters) extends MultiIOModule {
+class SimShell(implicit p: Parameters) extends Module {
   val mem = IO(new AXIClient(p(ShellKey).memParams))
   val host = IO(new AXILiteMaster(p(ShellKey).hostParams))
   val sim_clock = IO(Input(Clock()))
diff --git a/hardware/chisel/src/main/scala/shell/VCR.scala b/hardware/chisel/src/main/scala/shell/VCR.scala
index 9a80cd7..3046712 100644
--- a/hardware/chisel/src/main/scala/shell/VCR.scala
+++ b/hardware/chisel/src/main/scala/shell/VCR.scala
@@ -131,7 +131,7 @@
     }
   }
 
-  when(io.host.aw.fire()) { waddr := io.host.aw.bits.addr }
+  when(io.host.aw.fire) { waddr := io.host.aw.bits.addr }
 
   io.host.aw.ready := wstate === sWriteAddress
   io.host.w.ready := wstate === sWriteData
@@ -158,25 +158,25 @@
 
   when(io.vcr.finish) {
     reg(0) := "b_10".U
-  }.elsewhen(io.host.w.fire() && addr(0).U === waddr) {
+  }.elsewhen(io.host.w.fire && addr(0).U === waddr) {
     reg(0) := wdata
   }
 
   for (i <- 0 until vp.nECnt) {
     when(io.vcr.ecnt(i).valid) {
       reg(eo + i) := io.vcr.ecnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(eo + i).U === waddr) {
+    }.elsewhen(io.host.w.fire && addr(eo + i).U === waddr) {
       reg(eo + i) := wdata
     }
   }
 
   for (i <- 0 until (vp.nVals + nPtrs)) {
-    when(io.host.w.fire() && addr(vo + i).U === waddr) {
+    when(io.host.w.fire && addr(vo + i).U === waddr) {
       reg(vo + i) := wdata
     }
   }
 
-  when(io.host.ar.fire()) {
+  when(io.host.ar.fire) {
     rdata := MuxLookup(io.host.ar.bits.addr, 0.U, reg_map)
   }
 
@@ -199,7 +199,7 @@
   for (i <- 0 until vp.nUCnt) {
     when(io.vcr.ucnt(i).valid) {
       reg(uo + i) := io.vcr.ucnt(i).bits
-    }.elsewhen(io.host.w.fire() && addr(uo + i).U === waddr) {
+    }.elsewhen(io.host.w.fire && addr(uo + i).U === waddr) {
       reg(uo + i) := wdata
     }
   }
diff --git a/hardware/chisel/src/main/scala/shell/VME.scala b/hardware/chisel/src/main/scala/shell/VME.scala
index 77dc069..5b4a1da 100644
--- a/hardware/chisel/src/main/scala/shell/VME.scala
+++ b/hardware/chisel/src/main/scala/shell/VME.scala
@@ -62,8 +62,6 @@
   val client_id  = UInt(clientBits.W)
   val client_tag = UInt(p(ShellKey).vmeParams.clientTagBitWidth.W)
   val client_mask = UInt(RequestQueueMaskBits.W)
-  override def cloneType =
-  new clientTag().asInstanceOf[this.type]
 }
 
 class VMECmd(implicit p: Parameters) extends VMEBase {
@@ -84,8 +82,6 @@
   val data = UInt(dataBits.W)
   val tag = UInt(p(ShellKey).vmeParams.clientTagBitWidth.W)
   val last = Bool()
-  override def cloneType =
-  new VMEData().asInstanceOf[this.type]
 }
 
 /** VMEReadMaster.
@@ -97,8 +93,6 @@
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Decoupled(new VMECmd)
   val data = Flipped(Decoupled(new VMEData))
-  override def cloneType =
-  new VMEReadMaster().asInstanceOf[this.type]
 }
 
 /** VMEReadClient.
@@ -110,8 +104,6 @@
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Flipped(Decoupled(new VMECmd))
   val data = Decoupled(new VMEData)
-  override def cloneType =
-  new VMEReadClient().asInstanceOf[this.type]
 }
 
 /** VMEWriteData.
@@ -125,9 +117,6 @@
 
   val data = UInt(dataBits.W)
   val strb = UInt(strbBits.W)
-
-  override def cloneType =
-  new VMEWriteData().asInstanceOf[this.type]
 }
 
 /** VMEWriteMaster.
@@ -140,8 +129,6 @@
   val cmd = Decoupled(new VMECmd)
   val data = Decoupled(new VMEWriteData)
   val ack = Input(Bool())
-  override def cloneType =
-  new VMEWriteMaster().asInstanceOf[this.type]
 }
 
 /** VMEWriteClient.
@@ -154,8 +141,6 @@
   val cmd = Flipped(Decoupled(new VMECmd))
   val data = Flipped(Decoupled(new VMEWriteData))
   val ack = Output(Bool())
-  override def cloneType =
-  new VMEWriteClient().asInstanceOf[this.type]
 }
 
 /** VMEMaster.
@@ -220,7 +205,7 @@
   }.otherwise{
   availableEntriesNext:= availableEntries
   }
-  when(reset.toBool){
+  when(reset.asBool){
   availableEntries := VecInit(Seq.fill(RequestQueueDepth)(true.B)).asUInt
   updateEntry := 0.U
   }.otherwise{
@@ -325,7 +310,7 @@
   val wstate = RegInit(sWriteIdle)
   val wr_cnt = RegInit(0.U(lenBits.W))
   io.vme.wr(0).cmd.ready := wstate === sWriteIdle
-  io.vme.wr(0).ack := io.mem.b.fire()
+  io.vme.wr(0).ack := io.mem.b.fire
   io.vme.wr(0).data.ready := wstate === sWriteData & io.mem.w.ready
   io.mem.aw.valid := wstate === sWriteAddr
   io.mem.aw.bits.addr := wr_addr
@@ -337,14 +322,14 @@
   io.mem.w.bits.last := wr_cnt === wr_len
   io.mem.w.bits.id   := p(ShellKey).memParams.idConst.U // no support for multiple writes
   io.mem.b.ready := wstate === sWriteResp
-  when(io.vme.wr(0).cmd.fire()) {
+  when(io.vme.wr(0).cmd.fire) {
     wr_len := io.vme.wr(0).cmd.bits.len
     wr_addr := io.vme.wr(0).cmd.bits.addr
   }
   when(wstate === sWriteIdle) {
     wr_cnt := 0.U
   }
-  .elsewhen(io.mem.w.fire()){
+  .elsewhen(io.mem.w.fire){
     wr_cnt := wr_cnt + 1.U
   }
   switch(wstate){
diff --git a/hardware/chisel/src/main/scala/shell/VMESimple.scala b/hardware/chisel/src/main/scala/shell/VMESimple.scala
index e430d81..aded8ce 100644
--- a/hardware/chisel/src/main/scala/shell/VMESimple.scala
+++ b/hardware/chisel/src/main/scala/shell/VMESimple.scala
@@ -38,7 +38,7 @@
 
   val nReadClients = p(ShellKey).vmeParams.nReadClients
   val rd_arb = Module(new Arbiter(new VMECmd, nReadClients))
-  val rd_arb_chosen = RegEnable(rd_arb.io.chosen, rd_arb.io.out.fire())
+  val rd_arb_chosen = RegEnable(rd_arb.io.chosen, rd_arb.io.out.fire)
 
   for (i <- 0 until nReadClients) { rd_arb.io.in(i) <> io.vme.rd(i).cmd }
 
@@ -57,7 +57,7 @@
       }
     }
     is(sReadData) {
-      when(io.mem.r.fire() && io.mem.r.bits.last) {
+      when(io.mem.r.fire && io.mem.r.bits.last) {
         rstate := sReadIdle
       }
     }
@@ -71,7 +71,7 @@
 
   when(wstate === sWriteIdle) {
     wr_cnt := 0.U
-  }.elsewhen(io.mem.w.fire()) {
+  }.elsewhen(io.mem.w.fire) {
     wr_cnt := wr_cnt + 1.U
   }
 
@@ -109,12 +109,12 @@
   val rd_addr = RegInit(0.U(addrBits.W))
   val wr_addr = RegInit(0.U(addrBits.W))
 
-  when(rd_arb.io.out.fire()) {
+  when(rd_arb.io.out.fire) {
     rd_len := rd_arb.io.out.bits.len
     rd_addr := rd_arb.io.out.bits.addr
   }
 
-  when(io.vme.wr(0).cmd.fire()) {
+  when(io.vme.wr(0).cmd.fire) {
     wr_len := io.vme.wr(0).cmd.bits.len
     wr_addr := io.vme.wr(0).cmd.bits.addr
   }
@@ -130,13 +130,13 @@
     io.vme.rd(i).data.bits.last := io.mem.r.bits.last
     io.vme.rd(i).data.bits.tag := localTag(i)
 
-    when (io.vme.rd(i).cmd.fire()) {
+    when (io.vme.rd(i).cmd.fire) {
       localTag(i) := io.vme.rd(i).cmd.bits.tag
     }
   }
 
   io.vme.wr(0).cmd.ready := wstate === sWriteIdle
-  io.vme.wr(0).ack := io.mem.b.fire()
+  io.vme.wr(0).ack := io.mem.b.fire
   io.vme.wr(0).data.ready := wstate === sWriteData & io.mem.w.ready
 
   // mem
diff --git a/hardware/chisel/src/main/scala/test/Test.scala b/hardware/chisel/src/main/scala/test/Test.scala
index 14fe1c2..98615c6 100644
--- a/hardware/chisel/src/main/scala/test/Test.scala
+++ b/hardware/chisel/src/main/scala/test/Test.scala
@@ -24,7 +24,7 @@
 import vta.shell._
 
 /** Test. This generates a testbench file for simulation */
-class Test(implicit p: Parameters) extends MultiIOModule {
+class Test(implicit p: Parameters) extends Module {
   val sim_clock = IO(Input(Clock()))
   val sim_wait = IO(Output(Bool()))
   val sim_shell = Module(new SimShell)
diff --git a/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala b/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
index 063e766..27ffb11 100644
--- a/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
+++ b/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
@@ -23,22 +23,4 @@
 
 import chisel3._
 
-abstract class GenericParameterizedBundle[+T <: Object]
-  (val params: T) extends Bundle {
-  override def cloneType = {
-    try {
-      this.getClass.getConstructors.head
-        .newInstance(params)
-        .asInstanceOf[this.type]
-    } catch {
-      case e: java.lang.IllegalArgumentException =>
-        throw new Exception(
-          "Unable to use GenericParameterizedBundle.cloneType on " +
-            this.getClass + ", probably because " + this.getClass +
-            "() takes more than one argument.  Consider overriding " +
-            "cloneType() on " + this.getClass,
-          e
-        )
-    }
-  }
-}
+abstract class GenericParameterizedBundle[+T <: Object](val params: T) extends Bundle {}
diff --git a/hardware/chisel/src/main/scala/util/SyncQueue.scala b/hardware/chisel/src/main/scala/util/SyncQueue.scala
index 1e77c99..606c869 100644
--- a/hardware/chisel/src/main/scala/util/SyncQueue.scala
+++ b/hardware/chisel/src/main/scala/util/SyncQueue.scala
@@ -105,18 +105,18 @@
 
   io.deq <> buffer.io.deq
   doubleQueue.io.enq.bits := io.enq.bits
-  doubleQueue.io.enq.valid := io.enq.fire() && (!buffer.io.enq.ready || doubleQueueHasValues)
+  doubleQueue.io.enq.valid := io.enq.fire && (!buffer.io.enq.ready || doubleQueueHasValues)
   doubleQueue.io.deq.ready := buffer.io.enq.ready
 
   val count = Wire(UInt(log2Up(entries + 1).W))
   val countNext = RegEnable(
     next = count,
     init = 0.U,
-    enable = io.enq.fire() || io.deq.fire())
-  when (io.enq.fire() && !io.deq.fire()) {
+    enable = io.enq.fire || io.deq.fire)
+  when (io.enq.fire && !io.deq.fire) {
     assert(countNext < entries.U)
     count := countNext + 1.U
-  }.elsewhen (!io.enq.fire() && io.deq.fire()) {
+  }.elsewhen (!io.enq.fire && io.deq.fire) {
     assert(countNext > 0.U)
     count := countNext - 1.U
   }.otherwise {
@@ -180,18 +180,18 @@
 
   io.deq <> buffer.io.deq
   memoryQueue.io.enq.bits := io.enq.bits
-  memoryQueue.io.enq.valid := io.enq.fire() && (!buffer.io.enq.ready || memoryQueueHasValues)
+  memoryQueue.io.enq.valid := io.enq.fire && (!buffer.io.enq.ready || memoryQueueHasValues)
   memoryQueue.io.deq.ready := buffer.io.enq.ready
 
   val count = Wire(UInt(log2Up(entries + 1).W))
   val countNext = RegEnable(
     next = count,
     init = 0.U,
-    enable = io.enq.fire() || io.deq.fire())
-  when (io.enq.fire() && !io.deq.fire()) {
+    enable = io.enq.fire || io.deq.fire)
+  when (io.enq.fire && !io.deq.fire) {
     assert(countNext < entries.U)
     count := countNext + 1.U
-  }.elsewhen (!io.enq.fire() && io.deq.fire()) {
+  }.elsewhen (!io.enq.fire && io.deq.fire) {
     assert(countNext > 0.U)
     count := countNext - 1.U
   }.otherwise {
@@ -226,24 +226,24 @@
   enqRR := RegEnable(
     next = ~enqRR,
     init = 1.U,
-    enable = io.enq.fire())
+    enable = io.enq.fire)
   val deqRR = Wire(Bool())
   deqRR := RegEnable(
     next = ~deqRR,
     init = 1.U,
-    enable = io.deq.fire())
+    enable = io.deq.fire)
 
 
-  val do_enq0 = WireInit(io.enq.fire() && enqRR)
-  val do_enq1 = WireInit(io.enq.fire() && ~enqRR)
-  val deq0 = WireInit(io.deq.fire() && deqRR)
-  val deq1 = WireInit(io.deq.fire() && ~deqRR)
+  val do_enq0 = WireInit(io.enq.fire && enqRR)
+  val do_enq1 = WireInit(io.enq.fire && ~enqRR)
+  val deq0 = WireInit(io.deq.fire && deqRR)
+  val deq1 = WireInit(io.deq.fire && ~deqRR)
   val do_deq0_next = RegNext(deq0 && do_enq0)
   val do_deq1_next = RegNext(deq1 && do_enq1)
   val do_deq0 = (deq0 && ~do_enq0) || do_deq0_next
   val do_deq1 = (deq1 && ~do_enq1) || do_deq1_next
 
-  val do_deq = WireInit(io.deq.fire())
+  val do_deq = WireInit(io.deq.fire)
   val full  = !queue0.io.enq.ready && !queue1.io.enq.ready
   val empty = !queue0.io.deq.valid && !queue1.io.deq.valid
 
@@ -301,8 +301,8 @@
   val empty = ptr_match && !maybe_full
   val full = ptr_match && maybe_full
 
-  val do_enq = WireInit(io.enq.fire())
-  val do_deq = WireInit(io.deq.fire())
+  val do_enq = WireInit(io.enq.fire)
+  val do_deq = WireInit(io.deq.fire)
 
   // check protocol
   val enq_next = RegNext(do_enq)
@@ -325,7 +325,7 @@
     enq_ptr.inc()
   }
 
-  val memAddr = Wire(enq_ptr.value.cloneType)
+  val memAddr = Wire(chiselTypeOf(enq_ptr.value))
   memAddr := enq_ptr.value
   when(!do_enq) {
     when(firstRead) {// output the 1st written data
@@ -384,8 +384,8 @@
   val empty = ptr_match && !maybe_full
   val full = ptr_match && maybe_full
 
-  val do_enq = WireInit(io.enq.fire())
-  val do_deq = WireInit(io.deq.fire())
+  val do_enq = WireInit(io.enq.fire)
+  val do_deq = WireInit(io.deq.fire)
 
 
   when(do_deq) {
@@ -405,7 +405,7 @@
   io.enq.ready := !full
   assert(!firstRead || !do_deq, "-F- Cannot have deq with first read as queue output is not valid yet")
 
-  val rdAddr = Wire(enq_ptr.value.cloneType)
+  val rdAddr = Wire(chiselTypeOf(enq_ptr.value))
   when(firstRead) {// output the 1st written data
     rdAddr := deq_ptr.value
   }.elsewhen (do_deq) {
@@ -446,11 +446,10 @@
 class MemIO[T <: Data](gen: T, entries: Int) extends Bundle
 {
   val wr_en   = Input(Bool())
-  val wr_data = Input(gen.cloneType)
+  val wr_data = Input(gen)
   val ch_en   = Input(Bool())
-  val rd_data = Output(gen.cloneType)
+  val rd_data = Output(gen)
   val addr    = Input(UInt(16.W)) // i dont care
-  override def cloneType: this.type = new MemIO(gen, entries).asInstanceOf[this.type]
 }
 class OnePortMem[T <: Data](
     gen: T,
@@ -478,11 +477,10 @@
 {
   val wr_en   = Input(Bool())
   val wr_addr = Input(UInt(16.W)) // i dont care
-  val wr_data = Input(gen.cloneType)
+  val wr_data = Input(gen)
   val rd_en   = Input(Bool())
   val rd_addr = Input(UInt(16.W)) // i dont care
-  val rd_data = Output(gen.cloneType)
-  override def cloneType: this.type = new MemIO2P(gen, entries).asInstanceOf[this.type]
+  val rd_data = Output(gen)
 }
 
 class TwoPortMem[T <: Data](
diff --git a/hardware/chisel/src/test/scala/unittest/AluTest.scala b/hardware/chisel/src/test/scala/unittest/AluTest.scala
index 7c2cb88..831bb45 100644
--- a/hardware/chisel/src/test/scala/unittest/AluTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/AluTest.scala
@@ -20,7 +20,7 @@
 package unittest
 
 import chisel3.util._
-import chisel3.iotesters.PeekPokeTester
+import chiseltest.iotesters._
 import scala.util.Random
 import unittest.util._
 import vta.core._
diff --git a/hardware/chisel/src/test/scala/unittest/GemmTest.scala b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
index f548389..3579124 100644
--- a/hardware/chisel/src/test/scala/unittest/GemmTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
@@ -19,9 +19,7 @@
 
 package unittest
 
-import chisel3._
-import chisel3.util._
-import chisel3.iotesters.PeekPokeTester
+import chiseltest.iotesters._
 import vta.core._
 import vta.util.config._
 
diff --git a/hardware/chisel/src/test/scala/unittest/Generic.scala b/hardware/chisel/src/test/scala/unittest/Generic.scala
index 3dc0b34..9afe754 100644
--- a/hardware/chisel/src/test/scala/unittest/Generic.scala
+++ b/hardware/chisel/src/test/scala/unittest/Generic.scala
@@ -20,27 +20,21 @@
 package unittest
 
 import chisel3._
-import chisel3.util._
 import vta.util.config._
-import chisel3.iotesters._
-import vta.{DefaultPynqConfig}
+import chiseltest._
+import chiseltest.iotesters._
+import org.scalatest.flatspec.AnyFlatSpec
+import vta.DefaultPynqConfig
 
-import org.scalatest.{Matchers, FlatSpec}
-
-class GenericTest[T <: Module, P <: PeekPokeTester[T], C <: Parameters]
-  (tag : String, dutFactory : (Parameters) => T, testerFactory : (T) => P) extends FlatSpec with Matchers {
+class GenericTest[T <: Module, P <: PeekPokeTester[T], C <: Parameters](
+    tag : String, dutFactory : (Parameters) => T, testerFactory : (T) => P
+  ) extends AnyFlatSpec with ChiselScalatestTester {
 
   implicit val p: Parameters = new DefaultPynqConfig
-
-  val arguments = Array(
-    "--backend-name", "treadle",
-    // "--backend-name", "vcs",
-    // "--is-verbose",
-    "--test-seed", "0"
-    )
+  val defaultOpts = Seq(TreadleBackendAnnotation)
 
   behavior of tag
   it should "not have expect violations" in {
-    chisel3.iotesters.Driver.execute(arguments, ()=> dutFactory(p))(testerFactory) should be (true)
+    test(dutFactory(p)).withAnnotations(defaultOpts).runPeekPoke(testerFactory)
   }
 }
diff --git a/hardware/chisel/src/test/scala/unittest/Launcher.scala b/hardware/chisel/src/test/scala/unittest/Launcher.scala
deleted file mode 100644
index 1b0d6da..0000000
--- a/hardware/chisel/src/test/scala/unittest/Launcher.scala
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest
-// taken from https://github.com/freechipsproject/chisel-testers
-
-import chisel3._
-import chisel3.iotesters.{Driver, TesterOptionsManager}
-import unittest.util._
-import vta.core._
-import vta.util.config._
-import vta.shell._
-
-class TestConfig extends Config(new CoreConfig ++ new PynqConfig)
-
-/* Launcher.
- *
- * The Launcher object includes a test list for the TestRunner to check.
- * Users can utilize this Launcher to run custom tests.
- *
- * How to Use:
- * When the user input: sbt 'test:runMain unittest.Launcher mvm'
- * the TestRunner will look for 'mvm' in the map and executes the
- * test that 'mvm' is mapped to
- */
-object Launcher {
-  implicit val p: Parameters = new TestConfig
-  val tests = Map(
-    "mvm" -> { (manager: TesterOptionsManager) =>
-      Driver.execute(() => new MatrixVectorMultiplication, manager) {
-        (c) => new TestMatrixVectorMultiplication(c)
-      }
-    },
-    "alu" -> { (manager: TesterOptionsManager) =>
-      Driver.execute(() => new AluVector, manager) {
-        (c) => new AluVectorTester(c)
-      }
-    }
-  )
-
-  def main(args: Array[String]): Unit = {
-    TestRunner(tests, args)
-  }
-}
diff --git a/hardware/chisel/src/test/scala/unittest/MvmTest.scala b/hardware/chisel/src/test/scala/unittest/MvmTest.scala
index bd4e10c..c2129ea 100644
--- a/hardware/chisel/src/test/scala/unittest/MvmTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/MvmTest.scala
@@ -21,7 +21,8 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
+import chiseltest._
+import chiseltest.iotesters._
 import scala.math.pow
 import unittest.util._
 import vta.core._
diff --git a/hardware/chisel/src/test/scala/unittest/SyncQueue2PortMemTest.scala b/hardware/chisel/src/test/scala/unittest/SyncQueue2PortMemTest.scala
index c8916ad..d26a1ba 100644
--- a/hardware/chisel/src/test/scala/unittest/SyncQueue2PortMemTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/SyncQueue2PortMemTest.scala
@@ -21,9 +21,7 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
-import scala.util.Random
-import unittest.util._
+import chiseltest.iotesters._
 import vta.util._
 import vta.util.config._
 
diff --git a/hardware/chisel/src/test/scala/unittest/SyncQueueTest.scala b/hardware/chisel/src/test/scala/unittest/SyncQueueTest.scala
index 105d0df..b001cf0 100644
--- a/hardware/chisel/src/test/scala/unittest/SyncQueueTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/SyncQueueTest.scala
@@ -21,7 +21,8 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
+import chiseltest._
+import chiseltest.iotesters._
 import scala.util.Random
 import unittest.util._
 import vta.util._
diff --git a/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala b/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala
index 21da5f0..bcc16f6 100644
--- a/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala
@@ -21,7 +21,8 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.{ChiselFlatSpec, Driver, PeekPokeTester}
+import chiseltest._
+import chiseltest.iotesters._
 import scala.util.Random
 import unittest.util._
 import vta.core._
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
index 1e4f153..5847474 100644
--- a/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
@@ -21,7 +21,8 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.PeekPokeTester
+import chiseltest._
+import chiseltest.iotesters._
 import unittest.util._
 import vta.core._
 import vta.util.config._
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
index 6b2234c..e0eac0e 100644
--- a/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
@@ -21,7 +21,8 @@
 
 import chisel3._
 import chisel3.util._
-import chisel3.iotesters.PeekPokeTester
+import chiseltest._
+import chiseltest.iotesters._
 import unittest.util._
 import vta.core._
 import vta.util.config._
diff --git a/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala b/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
deleted file mode 100644
index 789eeb9..0000000
--- a/hardware/chisel/src/test/scala/unittest/utils/TestRunner.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package unittest.util
-// taken from https://github.com/freechipsproject/chisel-testers
-
-import scala.collection.mutable.ArrayBuffer
-import chisel3.iotesters._
-
-object TestRunner {
-
-  def apply(testMap: Map[String, TesterOptionsManager => Boolean], args: Array[String]): Unit = {
-    var successful = 0
-    val errors = new ArrayBuffer[String]
-
-    val optionsManager = new TesterOptionsManager()
-    optionsManager.doNotExitOnHelp()
-
-    optionsManager.parse(args)
-
-    val programArgs = optionsManager.commonOptions.programArgs
-
-    if(programArgs.isEmpty) {
-      println("Available tests")
-      for(x <- testMap.keys) {
-        println(x)
-      }
-      println("all")
-      System.exit(0)
-    }
-
-    val testsToRun = if(programArgs.exists(x => x.toLowerCase() == "all")) {
-      testMap.keys
-    }
-    else {
-      programArgs
-    }
-
-    for(testName <- testsToRun) {
-      testMap.get(testName) match {
-        case Some(test) =>
-          println(s"Starting $testName")
-          try {
-            optionsManager.setTopName(testName)
-            optionsManager.setTargetDirName(s"test_run_dir/$testName")
-            if(test(optionsManager)) {
-              successful += 1
-            }
-            else {
-              errors += s"$testName: test error occurred"
-            }
-          }
-          catch {
-            case exception: Exception =>
-              exception.printStackTrace()
-              errors += s"$testName: exception ${exception.getMessage}"
-            case t : Throwable =>
-              errors += s"$testName: throwable ${t.getMessage}"
-          }
-        case _ =>
-          errors += s"Bad Test name: $testName"
-      }
-
-    }
-    if(successful > 0) {
-      println(s"Tests passing: $successful")
-    }
-    if(errors.nonEmpty) {
-      println("=" * 80)
-      println(s"Errors: ${errors.length}: in the following tests")
-      println(errors.mkString("\n"))
-      println("=" * 80)
-    }
-  }
-}