Chisel Pipelined GEMM (#30)
* Reset to 644 file permissions
* Add json files to src/test/resources for testing
* Add new TensorGemmPipelinedSplit module and rename existing TensorGemm to TensorGemmOrig
* Tests for TensorGemmPipelinedSplit, TensorGemmOrig, and associated submodules
* Add jackson plugin dependency and stricter Scala checks
* Remove debug prints
* Rename x.json and y.json to gemm_1uop_overflow_offset.json and gemm_2uop_overflow_cascaded.json respectively
* All occurrences of '\( ' replaced with '\('
* Add linting rule to flag spaces after lparen characters
* Remove comment
* Rename TensorGemmOrig to TensorGemmSimple
diff --git a/hardware/chisel/build.sbt b/hardware/chisel/build.sbt
index 7efd59d..851f5ab 100644
--- a/hardware/chisel/build.sbt
+++ b/hardware/chisel/build.sbt
@@ -68,5 +68,13 @@
libraryDependencies ++= Seq("chisel3","chisel-iotesters").map {
dep: String => "edu.berkeley.cs" %% dep % sys.props.getOrElse(dep + "Version", defaultVersions(dep)) }
+libraryDependencies ++= Seq(
+ "com.fasterxml.jackson.core" % "jackson-databind" % "2.10.3",
+ "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.10.3"
+)
+
+scalacOptions += "-language:reflectiveCalls"
+scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature", "-Xfatal-warnings")
+
scalacOptions ++= scalacOptionsVersion(scalaVersion.value)
javacOptions ++= javacOptionsVersion(scalaVersion.value)
diff --git a/hardware/chisel/scalastyle-config.xml b/hardware/chisel/scalastyle-config.xml
index 1252900..89196be 100644
--- a/hardware/chisel/scalastyle-config.xml
+++ b/hardware/chisel/scalastyle-config.xml
@@ -71,6 +71,11 @@
</check>
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.DisallowSpaceAfterTokenChecker" enabled="true">
+ <parameters>
+ <parameter name="tokens">LPAREN</parameter>
+ </parameters>
+ </check>
<check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
diff --git a/hardware/chisel/src/main/scala/core/TensorGemm.scala b/hardware/chisel/src/main/scala/core/TensorGemm.scala
index e977552..f63de94 100644
--- a/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ b/hardware/chisel/src/main/scala/core/TensorGemm.scala
@@ -21,12 +21,11 @@
import chisel3._
import chisel3.util._
-import chisel3.experimental._
import vta.util.config._
import scala.math.pow
/** Pipelined multiply and accumulate */
-class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module {
+class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16, flopIn: Boolean = false) extends Module {
val outBits = Math.max(aBits + bBits, cBits) + 1
val io = IO(new Bundle {
val a = Input(SInt(aBits.W))
@@ -34,16 +33,15 @@
val c = Input(SInt(cBits.W))
val y = Output(SInt(outBits.W))
})
+
val mult = Wire(SInt((aBits + bBits).W))
- val add = Wire(SInt(outBits.W))
- val rA = RegNext(io.a)
- val rB = RegNext(io.b)
- val rC = RegNext(io.c)
+ val rA = if (flopIn) RegNext(io.a) else io.a
+ val rB = if (flopIn) RegNext(io.b) else io.b
+ val rC = if (flopIn) RegNext(io.c) else io.c
mult := rA * rB
- add := rC +& mult
-
- io.y := add
+ val addV = if (flopIn) {rC +& mult} else {RegNext(rC +& mult)}
+ io.y := addV
}
/** PipeAdder
@@ -86,28 +84,31 @@
}
/** Pipelined DotProduct based on MAC and PipeAdder */
-class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module {
+class DotProduct(aBits: Int = 8, bBits: Int = 8, blockIn: Int = 16) extends Module {
val errorMsg =
s"\n\n[VTA] [DotProduct] size must be greater than 4 and a power of 2\n\n"
- require(size >= 2 && isPow2(size), errorMsg)
+ require(blockIn >= 2 && isPow2(blockIn), errorMsg)
val b = aBits + bBits
- val outBits = b + log2Ceil(size) + 1
+ val outBits = b + log2Ceil(blockIn) + 1
val io = IO(new Bundle {
- val a = Input(Vec(size, SInt(aBits.W)))
- val b = Input(Vec(size, SInt(bBits.W)))
+ val a = Input(Vec(blockIn, SInt(aBits.W)))
+ val b = Input(Vec(blockIn, SInt(bBits.W)))
val y = Output(SInt(outBits.W))
})
- val s = Seq.tabulate(log2Ceil(size + 1))(i =>
- pow(2, log2Ceil(size) - i).toInt) // # of total layers
- val p = log2Ceil(size / 2) + 1 // # of adder layers
- val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs
+ val s = Seq.tabulate(log2Ceil(blockIn + 1))(i =>
+ pow(2, log2Ceil(blockIn) - i).toInt) // # of total layers
+ val p = log2Ceil(blockIn / 2) + 1 // # of adder layers
+ val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1, flopIn = p < 6))) // # of total vector pairs
val a = Seq.tabulate(p)(
i =>
Seq.fill(s(i + 1))(
- if (i == 0)
+ if ((i == 0 && p < 4) || (i == p - 2 && p >= 4)) {
Module(new PipeAdder(aBits = (b + i + 1), bBits = (b + i + 1)))
- else
- Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer
+ }
+ else {
+ Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1)))
+ }
+ )) // # adders within each layer
// Vector MACs
for (i <- 0 until s(0)) {
@@ -137,7 +138,8 @@
/** Perform matrix-vector-multiplication based on DotProduct */
class MatrixVectorMultiplication(implicit p: Parameters) extends Module {
val accBits = p(CoreKey).accBits
- val size = p(CoreKey).blockOut
+ val size = p(CoreKey).blockOut / p(CoreKey).blockOutFactor
+ val batch = p(CoreKey).batch
val inpBits = p(CoreKey).inpBits
val wgtBits = p(CoreKey).wgtBits
val outBits = p(CoreKey).outBits
@@ -149,28 +151,149 @@
val acc_o = new TensorClientData(tensorType = "acc")
val out = new TensorClientData(tensorType = "out")
})
- val dot = Seq.fill(size)(
- Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size)))
+ val dot = Seq.fill(batch)(Seq.fill(size)(
+ Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size))))
// Latency is defined as two in the following, because there is one cycle in the MAC module,
// and another cycle in the pipelined adders as the first layer of the accumulator
- val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2)))
- val add = Seq.fill(size)(Wire(SInt(accBits.W)))
- val vld = Wire(Vec(size, Bool()))
+ val acc = Seq.fill(batch)(Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2))))
+ val add = Seq.fill(batch)(Seq.fill(size)(Wire(SInt(accBits.W))))
+ val vld = Wire(Vec(batch, Vec(size, Bool())))
- for (i <- 0 until size) {
- acc(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
- acc(i).io.enq.bits := io.acc_i.data.bits(0)(i)
- for (j <- 0 until size) {
- dot(i).io.a(j) := io.inp.data.bits(0)(j).asSInt
- dot(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
+ for (b <- 0 until batch) {
+ for (i <- 0 until size) {
+ acc(b)(i).io.enq.valid := io.inp.data.valid & io.wgt.data.valid & io.acc_i.data.valid & ~io.reset
+ acc(b)(i).io.enq.bits := io.acc_i.data.bits(b)(i)
+ for (j <- 0 until size) {
+ dot(b)(i).io.a(j) := io.inp.data.bits(b)(j).asSInt
+ dot(b)(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt // all batches get the same weight - reuse
+ }
+ add(b)(i) := acc(b)(i).io.deq.bits.asSInt + dot(b)(i).io.y
+ io.acc_o.data.bits(b)(i) := Mux(io.reset, 0.U, add(b)(i).asUInt)
+ io.out.data.bits(b)(i) := add(b)(i).asUInt
+ vld(b)(i) := acc(b)(i).io.deq.valid
}
- add(i) := acc(i).io.deq.bits.asSInt + dot(i).io.y
- io.acc_o.data.bits(0)(i) := Mux(io.reset, 0.U, add(i).asUInt)
- io.out.data.bits(0)(i) := add(i).asUInt
- vld(i) := acc(i).io.deq.valid
+ io.acc_o.data.valid := vld.asUInt.andR | io.reset
+ io.out.data.valid := vld.asUInt.andR
}
- io.acc_o.data.valid := vld.asUInt.andR | io.reset
- io.out.data.valid := vld.asUInt.andR
+}
+
+/** Matrix-vector multiplication based on DotProduct; bypass_cond selects the last acc write over acc_i */
+class MatrixVectorMultiplicationBypass(implicit p: Parameters) extends Module {
+ val accBits = p(CoreKey).accBits
+ val blockOut = p(CoreKey).blockOut / p(CoreKey).blockOutFactor
+ val blockIn = p(CoreKey).blockIn
+ val batch = p(CoreKey).batch
+ val inpBits = p(CoreKey).inpBits
+ val wgtBits = p(CoreKey).wgtBits
+ val outBits = p(CoreKey).outBits
+ val io = IO(new Bundle {
+ val valid_reset = Input(Bool())
+ val inp = new TensorMasterData(tensorType = "inp")
+ val wgt = new TensorMasterData(tensorType = "wgt")
+ val acc_i = new TensorMasterData(tensorType = "acc")
+ val acc_o = new TensorClientData(tensorType = "acc")
+ val out = new TensorClientData(tensorType = "out")
+ val bypass_cond = Input(Bool())
+ })
+ val dot = Seq.fill(batch)(Seq.fill(blockOut)(
+ Module(new DotProduct(aBits = inpBits, bBits = wgtBits, blockIn))))
+ val add = Seq.fill(batch)(Seq.fill(blockOut)(Wire(SInt(accBits.W))))
+ val last_acc_write = Seq.fill(batch)(Seq.fill(blockOut){Reg(SInt(accBits.W))})
+ io.out.data.bits := DontCare // out is not fully initialized by a single module
+ for (b <- 0 until batch) {
+ for (i <- 0 until blockOut) {
+ for (j <- 0 until blockIn) {
+ dot(b)(i).io.a(j) := io.inp.data.bits(b)(j).asSInt
+ dot(b)(i).io.b(j) := io.wgt.data.bits(i)(j).asSInt
+ }
+ val byp = Mux(io.bypass_cond, last_acc_write(b)(i), io.acc_i.data.bits(b)(i).asSInt)
+ add(b)(i) := byp + dot(b)(i).io.y
+ val tmp = Mux(io.valid_reset, 0.S, add(b)(i))
+ io.acc_o.data.bits(b)(i) := tmp.asUInt
+ last_acc_write(b)(i) := tmp
+ io.out.data.bits(b)(i) := add(b)(i).asUInt
+ }
+ }
+ io.acc_o.data.valid := io.acc_i.data.valid | io.valid_reset
+ io.out.data.valid := io.acc_i.data.valid & ~io.valid_reset
+}
+
+class TensorGemmIndexGenerator(implicit p: Parameters) extends Module {
+ val io = IO(new Bundle {
+ val start = Input(Bool())
+ val last = Output(Bool())
+
+ val dec = Input(new GemmDecode)
+
+ val acc_i = Output(UInt(new TensorParams(tensorType="acc").memAddrBits.W))
+ val inp_i = Output(UInt(new TensorParams(tensorType="inp").memAddrBits.W))
+ val wgt_i = Output(UInt(new TensorParams(tensorType="wgt").memAddrBits.W))
+
+ val uop_idx = Output(UInt(log2Ceil(p(CoreKey).uopMemDepth).W))
+
+ val valid = Output(Bool())
+ })
+
+ io.last := false.B
+
+ val running = RegInit(false.B)
+ when(!running && io.start) {
+ running := true.B
+ }.elsewhen(io.last) {
+ running := false.B
+ }
+
+ val cnt_i = Reg(chiselTypeOf(io.dec.lp_1))
+ val acc_i = Reg(chiselTypeOf(io.acc_i))
+ val inp_i = Reg(chiselTypeOf(io.inp_i))
+ val wgt_i = Reg(chiselTypeOf(io.wgt_i))
+
+ val cnt_o = Reg(chiselTypeOf(io.dec.lp_0))
+ val acc_o = Reg(chiselTypeOf(io.acc_i))
+ val inp_o = Reg(chiselTypeOf(io.inp_i))
+ val wgt_o = Reg(chiselTypeOf(io.wgt_i))
+
+ val uop_idx = Reg(chiselTypeOf(io.dec.uop_end))
+
+ io.valid := running
+ io.acc_i := acc_i
+ io.inp_i := inp_i
+ io.wgt_i := wgt_i
+ io.uop_idx := uop_idx
+
+ when(!running) {
+ cnt_i := 0.U; acc_i := 0.U; inp_i := 0.U; wgt_i := 0.U
+ cnt_o := 0.U; acc_o := 0.U; inp_o := 0.U; wgt_o := 0.U
+ uop_idx := io.dec.uop_begin
+ } .otherwise {
+ when (uop_idx =/= io.dec.uop_end - 1.U) {
+ uop_idx := uop_idx + 1.U
+ }.otherwise {
+ uop_idx := io.dec.uop_begin
+ when (cnt_i =/= io.dec.lp_1 - 1.U) {
+ cnt_i := cnt_i + 1.U
+ acc_i := acc_i + io.dec.acc_1
+ inp_i := inp_i + io.dec.inp_1
+ wgt_i := wgt_i + io.dec.wgt_1
+ }.otherwise {
+ when (cnt_o =/= io.dec.lp_0 - 1.U) {
+ val acc_tmp = acc_o + io.dec.acc_0
+ val inp_tmp = inp_o + io.dec.inp_0
+ val wgt_tmp = wgt_o + io.dec.wgt_0
+ cnt_o := cnt_o + 1.U
+ acc_o := acc_tmp
+ inp_o := inp_tmp
+ wgt_o := wgt_tmp
+ cnt_i := 0.U
+ acc_i := acc_tmp
+ inp_i := inp_tmp
+ wgt_i := wgt_tmp
+ } .otherwise {
+ io.last := true.B
+ }
+ }
+ }
+ }
}
abstract class TensorGemmIfc(implicit p: Parameters) extends Module {
@@ -190,16 +313,16 @@
})
}
-/** TensorGemm.
+/** TensorGemmSimple
*
* This unit instantiate the MatrixVectorMultiplication and go over the
* micro-ops (uops) which are used to read inputs, weights and biases,
* and writes results back to the acc and out scratchpads.
*
- * Also, the TensorGemm uses the reset field in the Gemm instruction to
+ * Also, TensorGemmSimple uses the reset field in the Gemm instruction to
* clear or zero-out the acc-scratchpad locations based on the micro-ops.
*/
-class TensorGemm(debug: Boolean = false)(implicit p: Parameters) extends TensorGemmIfc {
+class TensorGemmSimple(debug: Boolean = false)(implicit p: Parameters) extends TensorGemmIfc {
require(p(CoreKey).blockOutFactor == 1,
"-F- Split GEMM not supported. Use TensorGemmPipelinedSplit or set blockOutFactor to 1")
@@ -227,12 +350,12 @@
// Latency is defined as two in the following, because there is one cycle in the MAC module,
// and another cycle in the pipelined adders as the first layer of the accumulator
val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2))
- val cond_last = cnt_o === dec.lp_0 - 1.U &
+ val cond = cnt_o === dec.lp_0 - 1.U &
cnt_i === dec.lp_1 - 1.U &
uop_idx === uop_end - 1.U
val done = inflight === 0.U &
- ((state === sExe) & cond_last | state === sWait)
+ ((state === sExe) & cond | state === sWait)
switch(state) {
is(sIdle) {
@@ -250,7 +373,7 @@
state := sExe
}
is(sExe) {
- when(cond_last) {
+ when(cond) {
when(inflight =/= 0.U) {
state := sWait
}.otherwise {
@@ -421,3 +544,206 @@
}
}
}
+
+class TensorGemmPipelinedSplit (implicit p: Parameters) extends TensorGemmIfc {
+ val sIdle::sRun::sWait::Nil = Enum(3);
+ val numMVMs = p(CoreKey).blockOutFactor
+ val numOuts = p(CoreKey).blockOut / numMVMs
+ require (numOuts > 0, "-F- Cannot factor more groups than blockOut")
+ val batch = p(CoreKey).batch
+
+ val m = Module(new TensorGemmIndexGenerator)
+
+ // additional pipe latency of wgt/inp read if needed
+ val scratchpadReadLatency = 0
+ val inpReadIdxLatency = 0
+ val uopReadLatency = 0
+
+ val delayed_valid = ShiftRegister(m.io.valid, uopReadLatency + 1, resetData = false.B, en = true.B)
+ val delayed_acc_i = ShiftRegister(m.io.acc_i, uopReadLatency + 1)
+ val delayed_inp_i = ShiftRegister(m.io.inp_i, uopReadLatency + 1)
+ val delayed_wgt_i = ShiftRegister(m.io.wgt_i, uopReadLatency + 1)
+
+ val state = RegInit(sIdle)
+ val inflight = RegInit(0.U(inflightBits.W))
+
+ val capture_dec = Reg(chiselTypeOf(io.dec))
+
+ io.done := false.B
+ when(state === sIdle && io.start) {
+ state := sRun
+ capture_dec := io.dec
+ // if (io.dec.empty_0 != None) assert(io.dec.empty_0.get === 0.U)
+ // if (io.dec.empty_1 != None) assert(io.dec.empty_1.get === 0.U)
+ }.elsewhen(state === sRun && m.io.last) {
+ state := sWait
+ }.elsewhen(state === sWait && inflight === 0.U) {
+ state := sIdle
+ io.done := true.B
+ }
+ io.state := state
+
+ assert(state =/= sRun || capture_dec.asUInt === io.dec.asUInt)
+ assert(state =/= sWait || capture_dec.asUInt === io.dec.asUInt)
+
+ m.io.start := io.start
+
+ m.io.dec := io.dec
+ io.uop.idx.bits := m.io.uop_idx
+ io.uop.idx.valid := m.io.valid
+
+ val delayedUopData = ShiftRegister(io.uop.data, uopReadLatency)
+
+ assert(delayedUopData.valid === delayed_valid)
+
+ val uop_valid = ShiftRegister(delayed_valid, inpReadIdxLatency, resetData = false.B, en = true.B)
+ val uop_acc = ShiftRegister(delayedUopData.bits.u0 + delayed_acc_i, inpReadIdxLatency)
+ val uop_inp = delayedUopData.bits.u1 + delayed_inp_i // it is piped in inp tensor read
+ val uop_wgt = ShiftRegister(delayedUopData.bits.u2 + delayed_wgt_i, inpReadIdxLatency)
+
+ val reset_pipe = Module(
+ new Pipe(
+ Bool(),
+ latency = 3 /* 1 stage is borrowed down here*/ + scratchpadReadLatency + inpReadIdxLatency + uopReadLatency))
+ reset_pipe.io.enq.valid := m.io.valid
+ reset_pipe.io.enq.bits := capture_dec.reset
+
+ val acc_idx_pipe = Module(
+ new Pipe(chiselTypeOf(io.acc.rd(0).idx.bits), latency= 1 /* borrow 1 stage to split*/ + scratchpadReadLatency))
+ acc_idx_pipe.io.enq.valid := uop_valid
+ acc_idx_pipe.io.enq.bits := uop_acc
+
+ require(io.inp.splitWidth == 1 && io.inp.splitLength == 1, "-F- Input split read not supported")
+ io.inp.rd(0).idx.valid := delayed_valid
+ io.inp.rd(0).idx.bits := uop_inp
+ val delayed_uop_valid = RegNext(uop_valid, init=false.B) // memdelay
+ // assert fires on emulated tensorRead Direct GEMM test TODO: fix memoryManager sram read
+ // it works only for VTA_CORE_GEMM_INP_IDX_PIPE 0
+ assert(io.inp.rd(0).data.valid === delayed_uop_valid)
+ for (idx <- 0 until numMVMs) {
+ io.acc.rd(idx).idx.valid := RegNext(acc_idx_pipe.io.deq.valid, init = false.B)
+ io.acc.rd(idx).idx.bits := RegNext(acc_idx_pipe.io.deq.bits)
+
+ // delay wgt read by input result delay latency
+ io.wgt.rd(idx).idx.valid := ShiftRegister(uop_valid, scratchpadReadLatency)
+ io.wgt.rd(idx).idx.bits := ShiftRegister(uop_wgt, scratchpadReadLatency)
+
+ assert(io.wgt.rd(idx).data.valid === ShiftRegister(delayed_uop_valid, scratchpadReadLatency))
+ }
+ io.wgt.tieoffWrite()
+ io.inp.tieoffWrite()
+
+ // create a pipe of 3+ delay with split by group last stage
+ // and a separate last stage for out and inflight
+ val wrpipe0 = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 2 + scratchpadReadLatency))
+ wrpipe0.io.enq.valid := uop_valid
+ wrpipe0.io.enq.bits := uop_acc
+ // write pipe not split
+ val wrpipeNs = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 1))
+ wrpipeNs.io.enq <> wrpipe0.io.deq
+ // split the last pipe stage per group
+ val wrpipe = for (idx <- 0 until numMVMs) yield {
+ val pipe = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency= 1))
+ pipe.io.enq <> wrpipe0.io.deq
+ pipe
+ }
+
+ for (idx <- 0 until numMVMs) {
+ assert(io.acc.rd(idx).data.valid === wrpipe(idx).io.deq.valid)
+ }
+
+ when(m.io.valid && wrpipeNs.io.deq.valid) {
+ }.elsewhen(m.io.valid) {
+ assert(inflight =/= ((1<<inflightBits)-1).U)
+ inflight := inflight + 1.U
+ }.elsewhen(wrpipeNs.io.deq.valid) {
+ assert(inflight =/= 0.U)
+ inflight := inflight - 1.U
+ }
+ when(state === sIdle) {
+ assert(inflight === 0.U)
+ inflight := 0.U
+ }
+
+ io.inflight := inflight
+
+ val mvmInpRdLatency = if (scratchpadReadLatency == 0) {
+ 0
+ } else {
+ scratchpadReadLatency - 1
+ }
+ // split factor of inp data for many groups
+ val splitFactorL0 = pow(2,log2Ceil(numMVMs) / 2).toInt
+ val splitFactorL1 = pow(2,log2Ceil(numMVMs)
+ - log2Ceil(numMVMs) / 2).toInt
+ require(splitFactorL0 * splitFactorL1 == numMVMs)
+ val inpRdData0 = for (idx <- 0 until splitFactorL0) yield {
+ if (scratchpadReadLatency > 0) RegNext(io.inp.rd(0).data) else io.inp.rd(0).data
+ }
+
+ // define MVC groups operating on a subset of acc elements
+ // each MVM generates only a part of the acc bits while having the whole interface defined !!!
+ // those bits are lower bits in acc/out interface
+ val mvc = for (idx <- 0 until numMVMs) yield {Module(new MatrixVectorMultiplicationBypass)}
+
+ require(io.out.splitWidth == 1 && io.out.splitLength == 1, "-F- Out split write is not supported")
+ for (idx1 <- 0 until numMVMs) {
+
+ val wrpipe2 = Module(new Pipe(chiselTypeOf(io.acc.wr(0).bits.idx), latency=1))
+ wrpipe2.io.enq := wrpipe(idx1).io.deq
+
+ mvc(idx1).io.bypass_cond :=
+ wrpipe(idx1).io.deq.bits === wrpipe2.io.deq.bits && wrpipe(idx1).io.deq.valid && wrpipe2.io.deq.valid
+
+ // borrow one stage from reset_pipe and split per group
+ mvc(idx1).io.valid_reset := RegNext(reset_pipe.io.deq.bits & reset_pipe.io.deq.valid, init = false.B)
+ // wire to each mvm
+ mvc(idx1).io.inp.data :=
+ ShiftRegister(inpRdData0(idx1/splitFactorL1), mvmInpRdLatency) // delay to deliver over distance
+ mvc(idx1).io.wgt.data := io.wgt.rd(idx1).data // wgt read idx is delayed instead of data
+ mvc(idx1).io.acc_i.data.valid := io.acc.rd(idx1).data.valid
+ assert(mvc(idx1).io.acc_o.data.valid === (wrpipe(idx1).io.deq.valid | mvc(idx1).io.valid_reset))
+ for(accLenIdx <- 0 until mvc(idx1).io.acc_o.lenSplit) {
+ for(accWdtIdx <- 0 until mvc(idx1).io.acc_o.widthSplit) {
+ val (gemmGrpIdx, gemmLenIdx, gemmWdtIdx) =
+ mvc(idx1).io.acc_o.reindexDataToGroup(idx1, accLenIdx, accWdtIdx)
+ mvc(gemmGrpIdx).io.acc_i.data.bits(gemmLenIdx)(gemmWdtIdx) :=
+ io.acc.rd(idx1).data.bits(accLenIdx)(accWdtIdx)
+ }
+ }
+
+ for(gemmLenIdx <- 0 until mvc(idx1).io.acc_o.lenSplit) {
+ for(gemmWdtIdx <- 0 until mvc(idx1).io.acc_o.widthSplit) {
+ val (accGrpIdx, accLenIdx, accWdtIdx) =
+ mvc(idx1).io.acc_o.reindexDataFromGroup(idx1, gemmLenIdx, gemmWdtIdx)
+ io.acc.wr(accGrpIdx).bits.data(accLenIdx)(accWdtIdx) :=
+ mvc(idx1).io.acc_o.data.bits(gemmLenIdx)(gemmWdtIdx)
+ }
+ }
+
+ io.acc.wr(idx1).valid := wrpipe(idx1).io.deq.valid
+ io.acc.wr(idx1).bits.idx := wrpipe(idx1).io.deq.bits
+ }
+// comment to split write out
+ if (numMVMs > 1) {
+ for (idx1 <- 1 until numMVMs) {
+ assert(mvc(idx1).io.out.data.valid === mvc(idx1 - 1).io.out.data.valid,
+ "-F- Out split write is not supported")
+ }
+ }
+ val outData = Wire(io.out.wr(0).bits.data.cloneType)
+ for (idx3 <- 0 until numMVMs) {
+ for (idx1 <- 0 until io.out.tensorLength) {
+ for (idx2 <- 0 until io.out.tensorWidth/numMVMs) {
+ outData(idx1)(idx3*io.out.tensorWidth/numMVMs + idx2) := mvc(idx3).io.out.data.bits(idx1)(idx2)
+ }
+ }
+ }
+ io.out.wr(0).bits.data := outData
+ io.out.wr(0).valid := wrpipeNs.io.deq.valid && mvc(io.acc.closestIOGrpIdx).io.out.data.valid
+ io.out.wr(0).bits.idx := wrpipeNs.io.deq.bits
+
+ io.out.tieoffRead()
+}
+
+class TensorGemm(implicit val p: Parameters) extends TensorGemmPipelinedSplit
diff --git a/hardware/chisel/src/test/resources/.gitignore b/hardware/chisel/src/test/resources/.gitignore
new file mode 100644
index 0000000..0521c5f
--- /dev/null
+++ b/hardware/chisel/src/test/resources/.gitignore
@@ -0,0 +1 @@
+!*.json
diff --git a/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json b/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json
new file mode 100644
index 0000000..05bc0b1
--- /dev/null
+++ b/hardware/chisel/src/test/resources/gemm_1uop_overflow_offset.json
@@ -0,0 +1,188 @@
+{
+ "inst": {
+ "reset": "0",
+ "uop_begin": "0001",
+ "uop_end": "0002",
+ "lp_0": "0001",
+ "lp_1": "0001",
+ "acc_0": "000",
+ "acc_1": "000",
+ "inp_0": "000",
+ "inp_1": "000",
+ "wgt_0": "000",
+ "wgt_1": "000"
+ },
+ "inp": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "01",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00",
+ "01",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00"
+ ]
+ }
+ ],
+ "wgt": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
+ "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
+ "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
+ "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
+ "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
+ "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
+ "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
+ "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
+ "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
+ "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
+ "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
+ "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "da", "db", "dc", "dd", "de", "df",
+ "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7", "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
+ "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
+ ]
+ }
+ ],
+ "acc_i": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000001",
+ "00000002",
+ "00000003",
+ "00000004",
+ "00000005",
+ "00000006",
+ "00000007",
+ "00000008",
+ "00000009",
+ "0000000a",
+ "0000000b",
+ "0000000c",
+ "0000000d",
+ "0000000e",
+ "0000000f"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000000",
+ "00000001",
+ "00000002",
+ "00000003",
+ "00000004",
+ "00000005",
+ "00000006",
+ "00000007",
+ "00000008",
+ "00000009",
+ "0000000a",
+ "0000000b",
+ "0000000c",
+ "0000000d",
+ "0000000e",
+ "0000000f"
+ ]
+ }
+ ],
+ "acc_o": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000001",
+ "00000002",
+ "00000003",
+ "00000004",
+ "00000005",
+ "00000006",
+ "00000007",
+ "00000008",
+ "00000009",
+ "0000000a",
+ "0000000b",
+ "0000000c",
+ "0000000d",
+ "0000000e",
+ "0000000f"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000001",
+ "00000012",
+ "00000023",
+ "00000034",
+ "00000045",
+ "00000056",
+ "00000067",
+ "00000078",
+ "ffffff89",
+ "ffffff9a",
+ "ffffffab",
+ "ffffffbc",
+ "ffffffcd",
+ "ffffffde",
+ "ffffffef",
+ "00000000"
+ ]
+ }
+ ],
+ "uop": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000001",
+ "00000001",
+ "00000000"
+ ]
+ }
+ ]
+}
diff --git a/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json b/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json
new file mode 100644
index 0000000..9ca3e4f
--- /dev/null
+++ b/hardware/chisel/src/test/resources/gemm_2uop_overflow_cascaded.json
@@ -0,0 +1,188 @@
+{
+ "inst": {
+ "reset": "0",
+ "uop_begin": "0000",
+ "uop_end": "0002",
+ "lp_0": "0001",
+ "lp_1": "0001",
+ "acc_0": "000",
+ "acc_1": "000",
+ "inp_0": "000",
+ "inp_1": "000",
+ "wgt_0": "000",
+ "wgt_1": "000"
+ },
+ "inp": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "01",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "ff",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00",
+ "00"
+ ]
+ }
+ ],
+ "wgt": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
+ "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
+ "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
+ "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
+ "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
+ "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
+ "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
+ "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
+ "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
+ "90", "91", "92", "93", "94", "95", "96", "97", "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
+ "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
+ "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
+ "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "da", "db", "dc", "dd", "de", "df",
+ "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7", "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
+ "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
+ ]
+ }
+ ],
+ "acc_i": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ }
+ ],
+ "acc_o": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ }
+ ],
+ "uop": [
+ {
+ "idx": "00000000",
+ "vec": [
+ "00000000",
+ "00000000",
+ "00000000"
+ ]
+ },
+ {
+ "idx": "00000001",
+ "vec": [
+ "00000000",
+ "00000001",
+ "00000000"
+ ]
+ }
+ ]
+}
diff --git a/hardware/chisel/src/test/scala/unittest/GemmTest.scala b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
new file mode 100644
index 0000000..f548389
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/GemmTest.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import vta.core._
+import vta.util.config._
+
+class MACTester(c: MAC) extends PeekPokeTester(c) {
+ poke(c.io.a, -1)
+ poke(c.io.b, 7)
+ poke(c.io.c, 10)
+ step(1)
+ expect(c.io.y, 3)
+ poke(c.io.a, -2)
+ poke(c.io.b, 7)
+ poke(c.io.c, 11)
+ step(1)
+ expect(c.io.y, -3)
+}
+
+class MACTest extends GenericTest("MACTest", (p:Parameters) => new MAC(),
+ (c:MAC) => new MACTester(c))
+
+class PipeAdderTester(c: PipeAdder) extends PeekPokeTester(c) {
+ poke(c.io.a, -1)
+ poke(c.io.b, 7)
+ step(1)
+ expect(c.io.y, 6)
+ poke(c.io.a, -2)
+ poke(c.io.b, 7)
+ step(1)
+ expect(c.io.y, 5)
+}
+
+class PipeAdderTest extends GenericTest("PipeAdderTest", (p:Parameters) => new PipeAdder(),
+ (c:PipeAdder) => new PipeAdderTester(c))
+
+class AdderTester(c: Adder) extends PeekPokeTester(c) {
+ poke(c.io.a, -1)
+ poke(c.io.b, 7)
+ expect(c.io.y, 6)
+ step(1)
+
+ poke(c.io.a, -2)
+ poke(c.io.b, 7)
+ expect(c.io.y, 5)
+ step(1)
+}
+
+class AdderTest extends GenericTest("AdderTest", (p:Parameters) => new Adder(),
+ (c:Adder) => new AdderTester(c))
+
+class DotProductTester(c: DotProduct) extends PeekPokeTester(c) {
+ for {i<- 0 until 16} {
+ poke(c.io.a(i), if (i %2 == 0) 1 else -1)
+ poke(c.io.b(i), i)
+ }
+ step(1)
+ for {i<- 0 until 16} {
+ poke(c.io.a(i), if (i %2 == 1) 1 else -1)
+ poke(c.io.b(i), i)
+ }
+ step(1)
+ expect(c.io.y, -8)
+ step(1)
+ expect(c.io.y, 8)
+}
+
+class DotProductTest extends GenericTest("DotProductTest", (p:Parameters) => new DotProduct(),
+ (c:DotProduct) => new DotProductTester(c))
diff --git a/hardware/chisel/src/test/scala/unittest/Generic.scala b/hardware/chisel/src/test/scala/unittest/Generic.scala
old mode 100755
new mode 100644
diff --git a/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala b/hardware/chisel/src/test/scala/unittest/TensorAluTest.scala
old mode 100755
new mode 100644
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
new file mode 100644
index 0000000..1e4f153
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmJsonTest.scala
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import unittest.util._
+import vta.core._
+import vta.util.config._
+
+import scala.io._
+import scala.language.postfixOps
+import com.fasterxml.jackson.databind.ObjectMapper
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
+
+class TensorGemmJsonTester(c: TensorGemmPipelinedSplit, fn : String = "/gemm_1uop_overflow_offset.json")
+  extends PeekPokeTester(c) {
+  // fn names a classpath resource holding an `inst` map plus the scratchpad sections.
+  val bufferedSource = Source.fromURL(getClass.getResource(fn))
+  val mapper = new ObjectMapper() with ScalaObjectMapper
+  mapper.registerModule(DefaultScalaModule)
+  // Close the source even when parsing throws, so a malformed file does not leak the stream.
+  val archState = try mapper.readValue[Map[String, Object]](bufferedSource.reader())
+    finally bufferedSource.close()
+  val inst = archState("inst").asInstanceOf[Map[String,String]]
+
+ /**
+  * Decodes one section of the JSON image into a scratchpad.
+  *
+  * @param tag key of the section in `archState`
+  * @return one array per entry, each element parsed from a hexadecimal string
+  */
+ def build_scratchpad(tag: String) : Array[Array[BigInt]] = {
+   val rows = archState(tag).asInstanceOf[Seq[Map[String,Object]]]
+   rows.zipWithIndex.map { case (row, pos) =>
+     // Each entry records its own address, which must equal its position in the list.
+     val addr = BigInt(row("idx").asInstanceOf[String], 16)
+     assert(BigInt(pos) == addr)
+     row("vec").asInstanceOf[Seq[String]].map(v => BigInt(v, 16)).toArray
+   }.toArray
+ }
+
+ val inp_scratchpad = build_scratchpad("inp")
+ val wgt_scratchpad = build_scratchpad("wgt")
+ val uop_scratchpad = build_scratchpad("uop")
+ val acc_scratchpad = build_scratchpad("acc_i")
+ val acc_o_scratchpad = build_scratchpad("acc_o")
+ // Keep start low while the instruction fields are applied.
+ poke(c.io.start, 0)
+ // Every field of `inst` is read as a hexadecimal string.
+ private def instField(key: String): BigInt = BigInt(inst(key), 16)
+ val dec_reset = instField("reset")
+ val uop_begin = instField("uop_begin")
+ val uop_end = instField("uop_end")
+ assert(uop_begin < uop_end)
+ val lp_0 = instField("lp_0")
+ val lp_1 = instField("lp_1")
+ val acc_0 = instField("acc_0")
+ val inp_0 = instField("inp_0")
+ val wgt_0 = instField("wgt_0")
+ val acc_1 = instField("acc_1")
+ val inp_1 = instField("inp_1")
+ val wgt_1 = instField("wgt_1")
+ // Drive each decoded field onto the decoder input of the same name.
+ poke(c.io.dec.reset, dec_reset)
+ poke(c.io.dec.uop_begin, uop_begin)
+ poke(c.io.dec.uop_end, uop_end)
+ poke(c.io.dec.lp_0, lp_0)
+ poke(c.io.dec.lp_1, lp_1)
+ poke(c.io.dec.acc_0, acc_0)
+ poke(c.io.dec.acc_1, acc_1)
+ poke(c.io.dec.inp_0, inp_0)
+ poke(c.io.dec.inp_1, inp_1)
+ poke(c.io.dec.wgt_0, wgt_0)
+ poke(c.io.dec.wgt_1, wgt_1)
+ // Don't need empty_0,{push,pop}_{next,prev},op
+
+ // Models read port 0: a request seen in one logical step is answered in the next one.
+ class TensorMasterMock(tm: TensorMaster, scratchpad : Array[Array[BigInt]]) {
+   private val port = tm.rd(0)
+   poke(port.data.valid, 0)
+   var valid = peek(port.idx.valid)
+   var idx : Int = 0
+   def logical_step() {
+     // Answer the request latched by the previous call, if there was one.
+     poke(port.data.valid, if (valid == 1) 1 else 0)
+     if (valid == 1) {
+       val cols = port.data.bits(0).size
+       for (r <- 0 until port.data.bits.size; k <- 0 until cols) {
+         poke(port.data.bits(r)(k), scratchpad(idx)(r*cols + k))
+       }
+     }
+     // Latch the request currently on the index channel for the next call.
+     valid = peek(port.idx.valid)
+     idx = peek(port.idx.bits).toInt
+   }
+ }
+
+ // Models write port 0: every valid write is copied into the backing scratchpad.
+ class TensorMasterMockWr(tm: TensorMaster, scratchpad : Array[Array[BigInt]]) {
+   def logical_step() {
+     val port = tm.wr(0)
+     if (peek(port.valid) == 1) {
+       val idx = peek(port.bits.idx).toInt
+       val cols = port.bits.data(0).size
+       for (r <- 0 until port.bits.data.size; k <- 0 until cols) {
+         // the rows of the data matrix sit back to back in the flat scratchpad entry
+         scratchpad(idx)(r*cols + k) = peek(port.bits.data(r)(k))
+       }
+     }
+   }
+ }
+
+ // Models the uop read port: a request seen in one logical step is answered in the next.
+ class UopMasterMock(um: UopMaster, scratchpad: Array[Array[BigInt]]) {
+   poke(um.data.valid, 0)
+   var valid = peek(um.idx.valid)
+   var idx : Int = 0
+   def logical_step() {
+     poke(um.data.valid, if (valid == 1) 1 else 0)
+     if (valid == 1) {
+       val entry = scratchpad(idx) // elements 0, 1, 2 feed u0, u1, u2
+       poke(um.data.bits.u0, entry(0))
+       poke(um.data.bits.u1, entry(1))
+       poke(um.data.bits.u2, entry(2))
+     }
+     valid = peek(um.idx.valid)
+     idx = peek(um.idx.bits).toInt
+   }
+ }
+
+ /**
+  * Memory mocks plus, for every port, the queue of indices the DUT is expected to issue,
+  * in issue order.
+  */
+ class Mocks {
+   val uop_mock = new UopMasterMock(c.io.uop, uop_scratchpad)
+   val inp_mock = new TensorMasterMock(c.io.inp, inp_scratchpad)
+   val wgt_mock = new TensorMasterMock(c.io.wgt, wgt_scratchpad)
+   val acc_mock = new TensorMasterMock(c.io.acc, acc_scratchpad)
+   val acc_mock_wr = new TensorMasterMockWr(c.io.acc, acc_scratchpad)
+
+   val uop_indices = new scala.collection.mutable.Queue[BigInt]
+   val acc_indices = new scala.collection.mutable.Queue[BigInt]
+   val inp_indices = new scala.collection.mutable.Queue[BigInt]
+   val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+   val accout_indices = new scala.collection.mutable.Queue[BigInt]
+   val out_indices = new scala.collection.mutable.Queue[BigInt]
+
+   // When `valid` is high, `bits` must equal the next index queued for that port.
+   private def expectNext(valid: Bits, bits: Bits, pending: scala.collection.mutable.Queue[BigInt]) {
+     if (peek(valid) == 1) {
+       expect(bits, pending.dequeue())
+     }
+   }
+
+   /** Advances one clock, services every mock, then checks the indices issued in this step. */
+   def logical_step() {
+     step(1)
+     uop_mock.logical_step()
+     inp_mock.logical_step()
+     wgt_mock.logical_step()
+     acc_mock.logical_step()
+     acc_mock_wr.logical_step()
+
+     expectNext(c.io.uop.idx.valid, c.io.uop.idx.bits, uop_indices)
+     expectNext(c.io.acc.rd(0).idx.valid, c.io.acc.rd(0).idx.bits, acc_indices)
+     expectNext(c.io.inp.rd(0).idx.valid, c.io.inp.rd(0).idx.bits, inp_indices)
+     expectNext(c.io.wgt.rd(0).idx.valid, c.io.wgt.rd(0).idx.bits, wgt_indices)
+     expectNext(c.io.acc.wr(0).valid, c.io.acc.wr(0).bits.idx, accout_indices)
+     expectNext(c.io.out.wr(0).valid, c.io.out.wr(0).bits.idx, out_indices)
+   }
+
+   /** Prints how many expected indices were never observed; it does not fail the test. */
+   def test_if_done() {
+     println(s"uop_indices should be empty ${uop_indices.size}")
+     println(s"acc_indices should be empty ${acc_indices.size}")
+     println(s"inp_indices should be empty ${inp_indices.size}")
+     println(s"wgt_indices should be empty ${wgt_indices.size}")
+     println(s"accout_indices should be empty ${accout_indices.size}")
+     println(s"out_indices should be empty ${out_indices.size}")
+   }
+
+   /**
+    * Compares the accumulator scratchpad with the expected image `acc_o`, element by
+    * element, and prints every mismatch. Empty images or rows compare as equal.
+    */
+   def check() = {
+     val mismatches = for {
+       ((x, y), idx) <- acc_scratchpad.zip(acc_o_scratchpad).zipWithIndex.toList
+       ((xx, yy), jdx) <- x.zip(y).zipWithIndex.toList
+       if xx != yy
+     } yield s"Value mismatch at $idx $jdx: $xx (actual) != $yy (expected)"
+     mismatches.foreach(println)
+     mismatches.isEmpty
+   }
+ }
+
+ val mocks = new Mocks
+ // Queue the index every port is expected to present, in issue order. The loops nest as
+ // lp_0 (outer) -> lp_1 (inner) -> uop, and each address is the uop field plus the strides.
+ for {
+   cnt_o <- BigInt(0) until lp_0
+   cnt_i <- BigInt(0) until lp_1
+   uop_idx <- uop_begin until uop_end
+ } {
+   val uop = uop_scratchpad(uop_idx.toInt)
+   val (u0, u1, u2) = (uop(0), uop(1), uop(2))
+   val acc_idx = u0 + acc_0*cnt_o + acc_1*cnt_i
+   mocks.uop_indices.enqueue(uop_idx)
+   mocks.acc_indices.enqueue(acc_idx)
+   mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
+   mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
+   mocks.accout_indices.enqueue(acc_idx)
+   // with reset set in the instruction, no index is expected on the out write port
+   if (dec_reset == 0) {
+     mocks.out_indices.enqueue(acc_idx)
+   }
+ }
+
+ poke(c.io.start, 0)
+ mocks.logical_step()
+ expect(c.io.state, c.sIdle)
+ poke(c.io.start, 1)
+ // Number of (lp_0, lp_1, uop) iterations described by the instruction.
+ val total_steps = (uop_end-uop_begin)*lp_0*lp_1
+ // Step budget for the run: four steps per iteration plus a fixed margin of 100.
+ val max_count = 100 + 4*total_steps
+ var count = 0
+ while (peek(c.io.done) == 0 && count < max_count) {
+ if (count % 100 == 0) {
+ println(s"logical_step $count")
+ }
+ mocks.logical_step()
+ if (count == 0) {
+ poke(c.io.start, 0)
+ }
+ count += 1
+ }
+ // The loop also ends when the budget runs out, so done is checked explicitly here.
+ assert(peek(c.io.done) == 1, s"Signal done never high even after $count steps.")
+ println(s"Signal done high after $count steps.")
+ // One logical step later done must be low again.
+ mocks.logical_step()
+ expect(c.io.done, 0)
+ // Compare the accumulator scratchpad written by the DUT with the expected image.
+ val cc = mocks.check()
+ println(s"Checking acc with acc_o ${cc}")
+ assert(cc)
+
+ println(s"Total active steps: ${total_steps}")
+ mocks.test_if_done()
+}
+
+class TensorGemmJsonTestSingleUopOverflowOffset extends GenericTest("TensorGemmJson",
+  (cfg: Parameters) => new TensorGemmPipelinedSplit()(cfg), // DUT; the tester replays the JSON below
+  (dut: TensorGemmPipelinedSplit) => new TensorGemmJsonTester(dut, "/gemm_1uop_overflow_offset.json"))
+
+class TensorGemmJsonTestDoubleUopOverflowCascaded extends GenericTest("TensorGemmJson",
+  (cfg: Parameters) => new TensorGemmPipelinedSplit()(cfg), // DUT; the tester replays the JSON below
+  (dut: TensorGemmPipelinedSplit) => new TensorGemmJsonTester(dut, "/gemm_2uop_overflow_cascaded.json"))
diff --git a/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
new file mode 100644
index 0000000..6b2234c
--- /dev/null
+++ b/hardware/chisel/src/test/scala/unittest/TensorGemmTest.scala
@@ -0,0 +1,742 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package unittest
+
+import chisel3._
+import chisel3.util._
+import chisel3.iotesters.PeekPokeTester
+import unittest.util._
+import vta.core._
+import vta.util.config._
+
+class TensorGemmTester(c: TensorGemmSimple) extends PeekPokeTester(c) {
+ poke(c.io.start, 0)
+ poke(c.io.dec.reset, 0)
+ poke(c.io.dec.uop_begin, 0)
+ poke(c.io.dec.uop_end, 1)
+ poke(c.io.dec.lp_0, 1)
+ poke(c.io.dec.lp_1, 1)
+ poke(c.io.dec.acc_0, 1)
+ poke(c.io.dec.acc_1, 1)
+ poke(c.io.dec.inp_0, 1)
+ poke(c.io.dec.inp_1, 1)
+ poke(c.io.dec.wgt_0, 1)
+ poke(c.io.dec.wgt_1, 1)
+ // Don't need empty_0, {push, pop}_{next, prev}, op
+ // The uop data inputs are held at zero for the whole test.
+ poke(c.io.uop.data.bits.u0, 0)
+ poke(c.io.uop.data.bits.u1, 0)
+ poke(c.io.uop.data.bits.u2, 0)
+ // Every element of the inp, wgt and acc read data is driven to 1.
+ val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.inp.rd(0).data.bits} {
+ poke(lhs, inp.reverse)
+ }
+
+ val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.wgt.rd(0).data.bits} {
+ poke(lhs, wgt.reverse)
+ }
+
+ val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.acc.rd(0).data.bits} {
+ poke(lhs, acc.reverse)
+ }
+
+ class TensorMasterMock(tm: TensorMaster) { // responder for read port 0 of one tensor memory
+   private val port = tm.rd(0)
+   poke(port.data.valid, 0)
+   var valid = peek(port.idx.valid)
+   def logical_step(v: BigInt) {
+     poke(port.data.valid, valid) // data.valid follows the request sampled by the previous call
+     valid = peek(port.idx.valid) // sample the request pending now
+     expect(port.idx.valid, v) // v is the level expected on idx.valid in this step
+   }
+ }
+
+ class UopMasterMock(um: UopMaster) { // responder for the uop read port
+   private val request = um.idx
+   poke(um.data.valid, 0)
+   var valid = peek(request.valid)
+   def logical_step(v: BigInt) {
+     poke(um.data.valid, valid) // data.valid follows the request sampled by the previous call
+     valid = peek(request.valid) // sample the request pending now
+     expect(request.valid, v) // v is the level expected on idx.valid in this step
+   }
+ }
+
+ // Groups the four memory mocks so that one call advances the DUT and all of them together.
+ class Mocks {
+   val uop_mock = new UopMasterMock(c.io.uop)
+   val inp_mock = new TensorMasterMock(c.io.inp)
+   val wgt_mock = new TensorMasterMock(c.io.wgt)
+   val acc_mock = new TensorMasterMock(c.io.acc)
+   // sram_valid: level expected on the three tensor read requests; uop_valid: on the uop one.
+   def logical_step(sram_valid: BigInt, uop_valid: BigInt) {
+     step(1)
+     uop_mock.logical_step(uop_valid)
+     // the tensor ports are serviced in the order inp, wgt, acc
+     Seq(inp_mock, wgt_mock, acc_mock).foreach(_.logical_step(sram_valid))
+   }
+ }
+
+ val mocks = new Mocks
+ poke(c.io.start, 0)
+
+ step(1)
+
+ expect(c.io.state, c.sIdle)
+ // Raise start: the next step must show a uop read request and no tensor read request.
+ poke(c.io.start, 1)
+ mocks.logical_step(0, 1)
+ expect(c.io.state, c.sReadUop)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+
+ poke(c.io.start, 0)
+ // No read request of either kind is expected while the state is sComputeIdx.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sComputeIdx)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // The tensor read requests are expected together with state sReadTensor.
+ mocks.logical_step(1, 0)
+ expect(c.io.state, c.sReadTensor)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // sExe: nothing is written yet and done is still low.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sExe)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ expect(c.io.done, 0)
+ // First sWait step: inflight is 1 and no write is visible yet.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sWait)
+ expect(c.io.inflight, 1)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // Second sWait step: both write ports are valid.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sWait)
+ expect(c.io.inflight, 1)
+
+ expect(c.io.out.wr(0).valid, 1)
+ expect(c.io.acc.wr(0).valid, 1)
+ // Third sWait step: inflight is back at 0 and the writes are gone.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sWait)
+ expect(c.io.inflight, 0)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // Finally the state is sIdle again.
+ mocks.logical_step(0, 0)
+ expect(c.io.state, c.sIdle)
+ expect(c.io.inflight, 0)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+
+}
+
+class TensorGemmTest extends GenericTest("TensorGemm", // state-by-state walk through one uop
+  (cfg: Parameters) => new TensorGemmSimple()(cfg), (dut: TensorGemmSimple) => new TensorGemmTester(dut))
+
+class TensorGemmIdxTester(c: TensorGemmSimple) extends PeekPokeTester(c) {
+
+ poke(c.io.start, 0)
+ // Two uops in a 2 x 3 loop nest; each outer stride is the inner stride times lp_1.
+ val uop_begin = 0
+ val uop_end = 2
+ assert(uop_begin < uop_end)
+ val lp_0 = 2
+ val lp_1 = 3
+ val acc_0 = 1*lp_1
+ val inp_0 = 2*lp_1
+ val wgt_0 = 4*lp_1
+ val acc_1 = 1
+ val inp_1 = 2
+ val wgt_1 = 4
+ val u0 = BigInt("000", 16)
+ val u1 = BigInt("100", 16)
+ val u2 = BigInt("200", 16)
+ // Drive the decoder inputs with the values chosen above.
+ poke(c.io.dec.reset, 0)
+ poke(c.io.dec.uop_begin, uop_begin)
+ poke(c.io.dec.uop_end, uop_end)
+ poke(c.io.dec.lp_0, lp_0)
+ poke(c.io.dec.lp_1, lp_1)
+ poke(c.io.dec.acc_0, acc_0)
+ poke(c.io.dec.acc_1, acc_1)
+ poke(c.io.dec.inp_0, inp_0)
+ poke(c.io.dec.inp_1, inp_1)
+ poke(c.io.dec.wgt_0, wgt_0)
+ poke(c.io.dec.wgt_1, wgt_1)
+ // Don't need empty_0,{push,pop}_{next,prev},op
+ // The uop data inputs are held constant, so every uop read returns u0, u1, u2.
+ poke(c.io.uop.data.bits.u0, u0)
+ poke(c.io.uop.data.bits.u1, u1)
+ poke(c.io.uop.data.bits.u2, u2)
+ // Every element of the inp, wgt and acc read data is driven to 1.
+ val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.inp.rd(0).data.bits} {
+ poke(lhs, inp.reverse)
+ }
+
+ val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.wgt.rd(0).data.bits} {
+ poke(lhs, wgt.reverse)
+ }
+
+ val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.acc.rd(0).data.bits} {
+ poke(lhs, acc.reverse)
+ }
+
+ class TensorMasterMock(tm: TensorMaster) { // responder for read port 0 of one tensor memory
+ poke(tm.rd(0).data.valid, 0)
+ var valid = peek(tm.rd(0).idx.valid) // request level sampled at construction
+ def logical_step(v: BigInt) { // v: level expected on idx.valid in this step
+ poke(tm.rd(0).data.valid, valid) // data.valid follows the previously sampled request
+ valid = peek(tm.rd(0).idx.valid) // sample the request pending now
+ expect(tm.rd(0).idx.valid, v)
+ }
+ }
+
+ class UopMasterMock(um: UopMaster) { // responder for the uop read port
+ poke(um.data.valid, 0)
+ var valid = peek(um.idx.valid) // request level sampled at construction
+ def logical_step(v: BigInt) { // v: level expected on idx.valid in this step
+ poke(um.data.valid, valid) // data.valid follows the previously sampled request
+ valid = peek(um.idx.valid) // sample the request pending now
+ expect(um.idx.valid, v)
+ }
+ }
+
+ // Memory mocks plus, per port, the queue of indices the DUT is expected to issue in order.
+ class Mocks {
+   val uop_mock = new UopMasterMock(c.io.uop)
+   val inp_mock = new TensorMasterMock(c.io.inp)
+   val wgt_mock = new TensorMasterMock(c.io.wgt)
+   val acc_mock = new TensorMasterMock(c.io.acc)
+
+   val uop_indices = new scala.collection.mutable.Queue[BigInt]
+   val acc_indices = new scala.collection.mutable.Queue[BigInt]
+   val inp_indices = new scala.collection.mutable.Queue[BigInt]
+   val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+   val accout_indices = new scala.collection.mutable.Queue[BigInt]
+   val out_indices = new scala.collection.mutable.Queue[BigInt]
+
+   // When `valid` is high, `bits` must equal the next index queued for that port.
+   private def expectNext(valid: Bits, bits: Bits, pending: scala.collection.mutable.Queue[BigInt]) {
+     if (peek(valid) == 1) {
+       expect(bits, pending.dequeue())
+     }
+   }
+
+   // Steps the clock, checks the request levels through the mocks, then the issued indices.
+   def logical_step(sram_valid: BigInt, uop_valid: BigInt) {
+     step(1)
+     uop_mock.logical_step(uop_valid)
+     inp_mock.logical_step(sram_valid)
+     wgt_mock.logical_step(sram_valid)
+     acc_mock.logical_step(sram_valid)
+     expectNext(c.io.uop.idx.valid, c.io.uop.idx.bits, uop_indices)
+     expectNext(c.io.acc.rd(0).idx.valid, c.io.acc.rd(0).idx.bits, acc_indices)
+     expectNext(c.io.inp.rd(0).idx.valid, c.io.inp.rd(0).idx.bits, inp_indices)
+     expectNext(c.io.wgt.rd(0).idx.valid, c.io.wgt.rd(0).idx.bits, wgt_indices)
+     expectNext(c.io.acc.wr(0).valid, c.io.acc.wr(0).bits.idx, accout_indices)
+     expectNext(c.io.out.wr(0).valid, c.io.out.wr(0).bits.idx, out_indices)
+   }
+
+   // Every expected index must have been consumed, on the two write ports as well.
+   def test_if_done() {
+     assert(uop_indices.isEmpty)
+     assert(acc_indices.isEmpty)
+     assert(inp_indices.isEmpty)
+     assert(wgt_indices.isEmpty)
+     assert(accout_indices.isEmpty)
+     assert(out_indices.isEmpty)
+   }
+ }
+
+ val mocks = new Mocks
+ // Expected indices in issue order: lp_0 is the outermost loop, the uop range the
+ // innermost. Each address is the constant uop field plus the two loop strides.
+ for (cnt_o <- 0 until lp_0; cnt_i <- 0 until lp_1; uop_idx <- uop_begin until uop_end) {
+   val acc_idx = u0 + acc_0*cnt_o + acc_1*cnt_i
+   mocks.uop_indices.enqueue(uop_idx)
+   mocks.acc_indices.enqueue(acc_idx)
+   mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
+   mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
+   // both write ports are expected to carry the accumulator read index
+   mocks.accout_indices.enqueue(acc_idx)
+   mocks.out_indices.enqueue(acc_idx)
+ }
+
+ poke(c.io.start, 0)
+ step(1)
+ expect(c.io.state, c.sIdle)
+
+ poke(c.io.start, 1)
+ // One pass of four logical steps per (lp_0, lp_1, uop) iteration.
+ for {q <- 0 until (uop_end-uop_begin)*lp_0*lp_1} {
+ mocks.logical_step(0, 1)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+
+ poke(c.io.start, 0)
+ // From the second iteration on, a write is expected in this step.
+ mocks.logical_step(0, 0)
+ expect(c.io.out.wr(0).valid, if (q > 0) 1 else 0)
+ expect(c.io.acc.wr(0).valid, if (q > 0) 1 else 0)
+ // The tensor read requests are expected in this step.
+ mocks.logical_step(1, 0)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+
+ mocks.logical_step(0, 0)
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ expect(c.io.done, 0)
+ }
+ // After the last iteration one more write is expected, in the second step below.
+ mocks.logical_step(0, 0)
+ expect(c.io.inflight, 1)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // Step with the final write on both write ports.
+ mocks.logical_step(0, 0)
+ expect(c.io.inflight, 1)
+
+ expect(c.io.out.wr(0).valid, 1)
+ expect(c.io.acc.wr(0).valid, 1)
+ // inflight must be back at 0 from here on.
+ mocks.logical_step(0, 0)
+ expect(c.io.inflight, 0)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+ // One further step with no activity on the write ports.
+ mocks.logical_step(0, 0)
+ expect(c.io.inflight, 0)
+
+ expect(c.io.out.wr(0).valid, 0)
+ expect(c.io.acc.wr(0).valid, 0)
+
+ mocks.test_if_done()
+}
+
+class TensorGemmIdxTest extends GenericTest("TensorGemmIdx", // checks the issued index sequence
+  (cfg: Parameters) => new TensorGemmSimple()(cfg), (dut: TensorGemmSimple) => new TensorGemmIdxTester(dut))
+
+class TensorGemmIndexGeneratorTester(c: TensorGemmIndexGenerator) extends PeekPokeTester(c) {
+ val uop_begin = 0 // two uops in a 2 x 3 loop nest
+ val uop_end = 2
+ assert(uop_begin < uop_end)
+ val lp_0 = 2
+ val lp_1 = 3
+ val acc_0 = 1*lp_1 // each outer stride is the inner stride times lp_1
+ val inp_0 = 2*lp_1
+ val wgt_0 = 4*lp_1
+ val acc_1 = 1
+ val inp_1 = 2
+ val wgt_1 = 4
+ // Drive the decoder inputs with the values chosen above.
+ poke(c.io.dec.reset, 0)
+ poke(c.io.dec.uop_begin, uop_begin)
+ poke(c.io.dec.uop_end, uop_end)
+ poke(c.io.dec.lp_0, lp_0)
+ poke(c.io.dec.lp_1, lp_1)
+ poke(c.io.dec.acc_0, acc_0)
+ poke(c.io.dec.acc_1, acc_1)
+ poke(c.io.dec.inp_0, inp_0)
+ poke(c.io.dec.inp_1, inp_1)
+ poke(c.io.dec.wgt_0, wgt_0)
+ poke(c.io.dec.wgt_1, wgt_1)
+ // Don't need empty_0,{push,pop}_{next,prev},op
+
+ // Holds, per output of the generator, the queue of values expected in issue order.
+ class Mocks {
+   val uop_indices = new scala.collection.mutable.Queue[BigInt]
+   val acc_indices = new scala.collection.mutable.Queue[BigInt]
+   val inp_indices = new scala.collection.mutable.Queue[BigInt]
+   val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+
+   // Advances one clock; when the generator flags valid, all four outputs are compared
+   // with the next queued values.
+   def logical_step() {
+     step(1)
+     if (peek(c.io.valid) == 1) {
+       expect(c.io.uop_idx, uop_indices.dequeue())
+       expect(c.io.acc_i, acc_indices.dequeue())
+       expect(c.io.inp_i, inp_indices.dequeue())
+       expect(c.io.wgt_i, wgt_indices.dequeue())
+     }
+   }
+
+   // Reports what is left in every queue first, then requires all queues to be empty.
+   def test_if_done() {
+     val queues = Seq("uop_indices" -> uop_indices, "acc_indices" -> acc_indices,
+       "inp_indices" -> inp_indices, "wgt_indices" -> wgt_indices)
+     queues.foreach { case (label, pending) => println(s"$label remaining: ${pending.size}") }
+     queues.foreach { case (_, pending) => assert(pending.isEmpty) }
+   }
+ }
+
+ val mocks = new Mocks
+ // Expected outputs in issue order: lp_0 is the outermost loop and the uop range the
+ // innermost. The three tensor indices only contain the loop strides, so they are the
+ // same for every uop of one (cnt_o, cnt_i) pair.
+ // uop_idx itself is the fourth expected output.
+ for (cnt_o <- 0 until lp_0; cnt_i <- 0 until lp_1; uop_idx <- uop_begin until uop_end) {
+   mocks.uop_indices.enqueue(uop_idx)
+   mocks.acc_indices.enqueue(acc_0*cnt_o + acc_1*cnt_i)
+   mocks.inp_indices.enqueue(inp_0*cnt_o + inp_1*cnt_i)
+   mocks.wgt_indices.enqueue(wgt_0*cnt_o + wgt_1*cnt_i)
+ }
+
+ poke(c.io.start, 1) // start is high for exactly one logical step
+ mocks.logical_step()
+ poke(c.io.start, 0)
+ // Step until last is raised, with a budget of ten steps per iteration plus 100.
+ val end = (uop_end-uop_begin)*lp_0*lp_1
+ var count = 0
+ while(peek(c.io.last) == 0 && count < 10*end + 100) {
+ mocks.logical_step()
+ count += 1
+ }
+ mocks.test_if_done() // all queued values must have been consumed by now
+}
+
+class TensorGemmIndexGeneratorTest extends GenericTest("TensorGemmIndexGenerator",
+  (cfg: Parameters) => new TensorGemmIndexGenerator()(cfg), // builds the DUT
+  (dut: TensorGemmIndexGenerator) => new TensorGemmIndexGeneratorTester(dut)) // and its tester
+
+class TensorGemmPipelinedTester(c: TensorGemmPipelinedSplit) extends PeekPokeTester(c) {
+ poke(c.io.start, 0)
+ // Two uops in a 2 x 3 loop nest; each outer stride is the inner stride times lp_1.
+ val uop_begin = 0
+ val uop_end = 2
+ assert(uop_begin < uop_end)
+ val lp_0 = 2
+ val lp_1 = 3
+ val acc_0 = 1*lp_1
+ val inp_0 = 2*lp_1
+ val wgt_0 = 4*lp_1
+ val acc_1 = 1
+ val inp_1 = 2
+ val wgt_1 = 4
+ val u0 = BigInt("000", 16)
+ val u1 = BigInt("100", 16)
+ val u2 = BigInt("200", 16)
+ // Drive the decoder inputs with the values chosen above.
+ poke(c.io.dec.reset, 0)
+ poke(c.io.dec.uop_begin, uop_begin)
+ poke(c.io.dec.uop_end, uop_end)
+ poke(c.io.dec.lp_0, lp_0)
+ poke(c.io.dec.lp_1, lp_1)
+ poke(c.io.dec.acc_0, acc_0)
+ poke(c.io.dec.acc_1, acc_1)
+ poke(c.io.dec.inp_0, inp_0)
+ poke(c.io.dec.inp_1, inp_1)
+ poke(c.io.dec.wgt_0, wgt_0)
+ poke(c.io.dec.wgt_1, wgt_1)
+ // Don't need empty_0,{push,pop}_{next,prev},op
+ // The uop data inputs are held constant, so every uop read returns u0, u1, u2.
+ poke(c.io.uop.data.bits.u0, u0)
+ poke(c.io.uop.data.bits.u1, u1)
+ poke(c.io.uop.data.bits.u2, u2)
+ // Every element of the inp, wgt and acc read data is driven to 1.
+ val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.inp.rd(0).data.bits} {
+ poke(lhs, inp.reverse)
+ }
+
+ val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.wgt.rd(0).data.bits} {
+ poke(lhs, wgt.reverse)
+ }
+
+ val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.acc.rd(0).data.bits} {
+ poke(lhs, acc.reverse)
+ }
+
+ class TensorMasterMock(tm: TensorMaster) { // responder for read port 0 of one tensor memory
+   poke(tm.rd(0).data.valid, 0)
+   var valid = peek(tm.rd(0).idx.valid)
+   def logical_step(v: Option[BigInt]) { // v: expected idx.valid level, None = not checked
+     poke(tm.rd(0).data.valid, valid) // data.valid follows the previously sampled request
+     valid = peek(tm.rd(0).idx.valid) // sample the request pending now
+     v.foreach(level => expect(tm.rd(0).idx.valid, level))
+   }
+ }
+
+ class UopMasterMock(um: UopMaster) { // responder for the uop read port
+   poke(um.data.valid, 0)
+   var valid = peek(um.idx.valid)
+   def logical_step(v: Option[BigInt]) { // v: expected idx.valid level, None = not checked
+     poke(um.data.valid, valid) // data.valid follows the previously sampled request
+     valid = peek(um.idx.valid) // sample the request pending now
+     v.foreach(level => expect(um.idx.valid, level))
+   }
+ }
+
+ // Memory mocks plus, per port, the queue of indices the DUT is expected to issue in order.
+ // Only the index values are checked here, not the cycle in which a request appears.
+ class Mocks {
+   val uop_mock = new UopMasterMock(c.io.uop)
+   val inp_mock = new TensorMasterMock(c.io.inp)
+   val wgt_mock = new TensorMasterMock(c.io.wgt)
+   val acc_mock = new TensorMasterMock(c.io.acc)
+
+   val uop_indices = new scala.collection.mutable.Queue[BigInt]
+   val acc_indices = new scala.collection.mutable.Queue[BigInt]
+   val inp_indices = new scala.collection.mutable.Queue[BigInt]
+   val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+   val accout_indices = new scala.collection.mutable.Queue[BigInt]
+   val out_indices = new scala.collection.mutable.Queue[BigInt]
+
+   // When `valid` is high, `bits` must equal the next index queued for that port.
+   private def expectNext(valid: Bits, bits: Bits, pending: scala.collection.mutable.Queue[BigInt]) {
+     if (peek(valid) == 1) {
+       expect(bits, pending.dequeue())
+     }
+   }
+
+   // Advances one clock, services the mocks, then checks the indices issued in this step.
+   def logical_step() {
+     step(1)
+     // None: the mocks do not check the level of the request itself
+     uop_mock.logical_step(None)
+     inp_mock.logical_step(None)
+     wgt_mock.logical_step(None)
+     acc_mock.logical_step(None)
+     expectNext(c.io.uop.idx.valid, c.io.uop.idx.bits, uop_indices)
+     expectNext(c.io.acc.rd(0).idx.valid, c.io.acc.rd(0).idx.bits, acc_indices)
+     expectNext(c.io.inp.rd(0).idx.valid, c.io.inp.rd(0).idx.bits, inp_indices)
+     expectNext(c.io.wgt.rd(0).idx.valid, c.io.wgt.rd(0).idx.bits, wgt_indices)
+     expectNext(c.io.acc.wr(0).valid, c.io.acc.wr(0).bits.idx, accout_indices)
+     expectNext(c.io.out.wr(0).valid, c.io.out.wr(0).bits.idx, out_indices)
+   }
+
+   // Prints how many expected indices were never observed; it does not fail the test.
+   def test_if_done() {
+     println(s"uop_indices remaining: ${uop_indices.size}")
+     println(s"acc_indices remaining: ${acc_indices.size}")
+     println(s"inp_indices remaining: ${inp_indices.size}")
+     println(s"wgt_indices remaining: ${wgt_indices.size}")
+     println(s"accout_indices remaining: ${accout_indices.size}")
+     println(s"out_indices remaining: ${out_indices.size}")
+   }
+ }
+
+ val mocks = new Mocks
+ // Expected indices in issue order: lp_0 is the outermost loop, the uop range the
+ // innermost. Each address is the constant uop field plus the two loop strides.
+ for (cnt_o <- 0 until lp_0; cnt_i <- 0 until lp_1; uop_idx <- uop_begin until uop_end) {
+   val acc_idx = u0 + acc_0*cnt_o + acc_1*cnt_i
+   mocks.uop_indices.enqueue(uop_idx)
+   mocks.acc_indices.enqueue(acc_idx)
+   mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
+   mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
+   // both write ports are expected to carry the accumulator read index
+   mocks.accout_indices.enqueue(acc_idx)
+   mocks.out_indices.enqueue(acc_idx)
+ }
+
+ poke(c.io.start, 0)
+ step(1)
+ expect(c.io.state, c.sIdle)
+ poke(c.io.start, 1)
+ // Step budget: ten steps per (lp_0, lp_1, uop) iteration plus a margin of 100.
+ var count = 0
+ val end = (uop_end-uop_begin)*lp_0*lp_1
+ // Run until done is raised; start is dropped after the first logical step.
+ while (peek(c.io.done) == 0 && count < 10*end + 100) {
+   mocks.logical_step()
+   poke(c.io.start, 0)
+   count += 1 // without this the budget above could never stop a DUT that hangs
+ }
+ expect(c.io.done, 1)
+ mocks.test_if_done()
+}
+
+class TensorGemmPipelinedTest extends GenericTest("TensorGemmPipelined",
+  (cfg: Parameters) => new TensorGemmPipelinedSplit()(cfg), // builds the DUT
+  (dut: TensorGemmPipelinedSplit) => new TensorGemmPipelinedTester(dut)) // and its tester
+
+class TensorGemmResetTester(c: TensorGemm) extends PeekPokeTester(c) {
+ poke(c.io.start, 0)
+ // Two uops in a 2 x 3 loop nest; each outer stride is the inner stride times lp_1.
+ val uop_begin = 0
+ val uop_end = 2
+ assert(uop_begin < uop_end)
+ val lp_0 = 2
+ val lp_1 = 3
+ val acc_0 = 1*lp_1
+ val inp_0 = 2*lp_1
+ val wgt_0 = 4*lp_1
+ val acc_1 = 1
+ val inp_1 = 2
+ val wgt_1 = 4
+ val u0 = BigInt("000", 16)
+ val u1 = BigInt("100", 16)
+ val u2 = BigInt("200", 16)
+ val dec_reset = 1
+ // Drive the decoder inputs with the values chosen above; the reset field is set.
+ poke(c.io.dec.reset, dec_reset)
+ poke(c.io.dec.uop_begin, uop_begin)
+ poke(c.io.dec.uop_end, uop_end)
+ poke(c.io.dec.lp_0, lp_0)
+ poke(c.io.dec.lp_1, lp_1)
+ poke(c.io.dec.acc_0, acc_0)
+ poke(c.io.dec.acc_1, acc_1)
+ poke(c.io.dec.inp_0, inp_0)
+ poke(c.io.dec.inp_1, inp_1)
+ poke(c.io.dec.wgt_0, wgt_0)
+ poke(c.io.dec.wgt_1, wgt_1)
+ // Don't need empty_0,{push,pop}_{next,prev},op
+ // The uop data inputs are held constant, so every uop read returns u0, u1, u2.
+ poke(c.io.uop.data.bits.u0, u0)
+ poke(c.io.uop.data.bits.u1, u1)
+ poke(c.io.uop.data.bits.u2, u2)
+ // Every element of the inp, wgt and acc read data is driven to 1.
+ val inp = IndexedSeq.fill(c.io.inp.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.inp.rd(0).data.bits} {
+ poke(lhs, inp.reverse)
+ }
+
+ val wgt = IndexedSeq.fill(c.io.wgt.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.wgt.rd(0).data.bits} {
+ poke(lhs, wgt.reverse)
+ }
+
+ val acc = IndexedSeq.fill(c.io.acc.rd(0).data.bits(0).size){BigInt(1)}
+ for {lhs <- c.io.acc.rd(0).data.bits} {
+ poke(lhs, acc.reverse)
+ }
+
+ class TensorMasterMock(tm: TensorMaster) { // responder for read port 0 of one tensor memory
+   poke(tm.rd(0).data.valid, 0)
+   var valid = peek(tm.rd(0).idx.valid)
+   def logical_step(v: Option[BigInt]) { // v: expected idx.valid level, None = not checked
+     poke(tm.rd(0).data.valid, valid) // data.valid follows the previously sampled request
+     valid = peek(tm.rd(0).idx.valid) // sample the request pending now
+     v.foreach(level => expect(tm.rd(0).idx.valid, level))
+   }
+ }
+
+ class UopMasterMock(um: UopMaster) { // responder for the uop read port
+   poke(um.data.valid, 0)
+   var valid = peek(um.idx.valid)
+   def logical_step(v: Option[BigInt]) { // v: expected idx.valid level, None = not checked
+     poke(um.data.valid, valid) // data.valid follows the previously sampled request
+     valid = peek(um.idx.valid) // sample the request pending now
+     v.foreach(level => expect(um.idx.valid, level))
+   }
+ }
+
+ // Memory mocks plus, per port, the queue of indices the DUT is expected to issue in order.
+ // Only the index values are checked here, not the cycle in which a request appears.
+ class Mocks {
+   val uop_mock = new UopMasterMock(c.io.uop)
+   val inp_mock = new TensorMasterMock(c.io.inp)
+   val wgt_mock = new TensorMasterMock(c.io.wgt)
+   val acc_mock = new TensorMasterMock(c.io.acc)
+
+   val uop_indices = new scala.collection.mutable.Queue[BigInt]
+   val acc_indices = new scala.collection.mutable.Queue[BigInt]
+   val inp_indices = new scala.collection.mutable.Queue[BigInt]
+   val wgt_indices = new scala.collection.mutable.Queue[BigInt]
+   val accout_indices = new scala.collection.mutable.Queue[BigInt]
+   val out_indices = new scala.collection.mutable.Queue[BigInt]
+
+   // When `valid` is high, `bits` must equal the next index queued for that port.
+   private def expectNext(valid: Bits, bits: Bits, pending: scala.collection.mutable.Queue[BigInt]) {
+     if (peek(valid) == 1) {
+       expect(bits, pending.dequeue())
+     }
+   }
+
+   // Advances one clock and checks the issued indices; both parameters are unused here.
+   def logical_step(sram_valid: BigInt, uop_valid: BigInt) {
+     step(1)
+     // None: the mocks do not check the level of the request itself
+     uop_mock.logical_step(None)
+     inp_mock.logical_step(None)
+     wgt_mock.logical_step(None)
+     acc_mock.logical_step(None)
+     expectNext(c.io.uop.idx.valid, c.io.uop.idx.bits, uop_indices)
+     expectNext(c.io.acc.rd(0).idx.valid, c.io.acc.rd(0).idx.bits, acc_indices)
+     expectNext(c.io.inp.rd(0).idx.valid, c.io.inp.rd(0).idx.bits, inp_indices)
+     expectNext(c.io.wgt.rd(0).idx.valid, c.io.wgt.rd(0).idx.bits, wgt_indices)
+     expectNext(c.io.acc.wr(0).valid, c.io.acc.wr(0).bits.idx, accout_indices)
+     expectNext(c.io.out.wr(0).valid, c.io.out.wr(0).bits.idx, out_indices)
+   }
+
+   // Every expected index, on every port, must have been consumed.
+   def test_if_done() {
+     assert(uop_indices.isEmpty)
+     assert(acc_indices.isEmpty)
+     assert(inp_indices.isEmpty)
+     assert(wgt_indices.isEmpty)
+     assert(accout_indices.isEmpty)
+     assert(out_indices.isEmpty)
+   }
+ }
+
+ val mocks = new Mocks
+ // Expected index per port in issue order; the loops nest lp_0 -> lp_1 -> uop.
+ for {
+   cnt_o <- 0 until lp_0
+   cnt_i <- 0 until lp_1
+   uop_idx <- uop_begin until uop_end
+ } {
+   mocks.uop_indices.enqueue(uop_idx)
+   mocks.acc_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+   mocks.inp_indices.enqueue(u1 + inp_0*cnt_o + inp_1*cnt_i)
+   mocks.wgt_indices.enqueue(u2 + wgt_0*cnt_o + wgt_1*cnt_i)
+   mocks.accout_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+   // dec_reset is 1 in this test, so nothing is queued for the out write port.
+   if (dec_reset == 0) mocks.out_indices.enqueue(u0 + acc_0*cnt_o + acc_1*cnt_i)
+ }
+ poke(c.io.start, 0)
+ step(1)
+ expect(c.io.state, c.sIdle)
+ poke(c.io.start, 1)
+ // Bounded run: a DUT that never raises done fails the expect below instead of hanging.
+ var count = 0
+ while(peek(c.io.done) == 0 && count < 10*(uop_end-uop_begin)*lp_0*lp_1 + 100) {
+   mocks.logical_step(0, 0)
+   poke(c.io.start, 0)
+   count += 1
+ }
+ expect(c.io.done, 1)
+ mocks.test_if_done()
+}
+
+class TensorGemmResetTest extends GenericTest("TensorGemmReset", // run with the reset field set
+  (cfg: Parameters) => new TensorGemm()(cfg), (dut: TensorGemm) => new TensorGemmResetTester(dut))