blob: e7d67d1f15839a6b436efe8e37efa74c248f12cf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package vta.core
import chisel3._
import chisel3.util._
import vta.util.config._
/** ALU datapath.
 *
 * Single-lane combinational ALU over signed, accumulator-width operands.
 * The operation is selected by io.opcode; any opcode outside the lookup
 * table passes io.a through unchanged (MuxLookup default).
 */
class Alu(implicit p: Parameters) extends Module {
  val aluBits = p(CoreKey).accBits
  val io = IO(new Bundle {
    val opcode = Input(UInt(C_ALU_OP_BITS.W))
    val a = Input(SInt(aluBits.W))
    val b = Input(SInt(aluBits.W))
    val y = Output(SInt(aluBits.W))
  })
  // FIXME: the following three will change once we support properly SHR and SHL
  val bUnsigned = io.b.asUInt
  val shamtWidth = log2Ceil(aluBits)
  // A left shift is encoded as a negative shift amount, so recover the
  // positive distance by two's-complement negating the low bits of b.
  val shlAmount = ~bUnsigned(shamtWidth - 1, 0) + 1.U
  val shrAmount = bUnsigned(shamtWidth - 1, 0)
  // opcode - min:0, max:1, add:2, shr:3, shl:4
  val results = Seq(
    Mux(io.a < io.b, io.a, io.b), // min
    Mux(io.a < io.b, io.b, io.a), // max
    io.a + io.b,                  // add
    io.a >> shrAmount,            // shr
    io.a << shlAmount)            // shl
  val opTable = Seq.tabulate(ALU_OP_NUM)(op => ALU_OP(op) -> results(op))
  io.y := MuxLookup(io.opcode, io.a, opTable)
}
/** Pipelined ALU.
 *
 * Wraps Alu with input operand registers (each captured on its own valid)
 * and a one-cycle valid pipeline keyed off io.b.valid. Operands are carried
 * as UInt at the interface and reinterpreted as SInt inside.
 */
class AluReg(implicit p: Parameters) extends Module {
  val io = IO(new Bundle {
    val opcode = Input(UInt(C_ALU_OP_BITS.W))
    val a = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
    val b = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
    val y = ValidIO(UInt(p(CoreKey).accBits.W))
  })
  val core = Module(new Alu)
  // register input: each operand is latched when its valid fires
  val regA = RegEnable(io.a.bits, io.a.valid)
  val regB = RegEnable(io.b.bits, io.b.valid)
  // result valid tracks operand b's valid, delayed by the register stage
  val resultValid = RegNext(io.b.valid)
  core.io.opcode := io.opcode
  core.io.a := regA.asSInt
  core.io.b := regB.asSInt
  // output
  io.y.valid := resultValid
  io.y.bits := core.io.y.asUInt
}
/** Vector of pipelined ALUs.
 *
 * Instantiates blockOut AluReg lanes, all driven by the same opcode.
 * Lane i processes element (0)(i) of the a/b tensors; results fan out to
 * both the acc and out tensor ports. Output valids assert only when every
 * lane's result is valid (andR reduction).
 */
class AluVector(implicit p: Parameters) extends Module {
  val io = IO(new Bundle {
    val opcode = Input(UInt(C_ALU_OP_BITS.W))
    val acc_a = new TensorMasterData(tensorType = "acc")
    val acc_b = new TensorMasterData(tensorType = "acc")
    val acc_y = new TensorClientData(tensorType = "acc")
    val out = new TensorClientData(tensorType = "out")
  })
  val blockOut = p(CoreKey).blockOut
  val lanes = Seq.fill(blockOut)(Module(new AluReg))
  val laneValid = Wire(Vec(blockOut, Bool()))
  lanes.zipWithIndex.foreach { case (lane, i) =>
    lane.io.opcode := io.opcode
    lane.io.a.valid := io.acc_a.data.valid
    lane.io.a.bits := io.acc_a.data.bits(0)(i)
    lane.io.b.valid := io.acc_b.data.valid
    lane.io.b.bits := io.acc_b.data.bits(0)(i)
    laneValid(i) := lane.io.y.valid
    // the same result feeds both the acc write-back and the out store
    io.acc_y.data.bits(0)(i) := lane.io.y.bits
    io.out.data.bits(0)(i) := lane.io.y.bits
  }
  io.acc_y.data.valid := laneValid.asUInt.andR
  io.out.data.valid := laneValid.asUInt.andR
}
/** Index generator for the pipelined tensor ALU.
 *
 * Walks the micro-op range [dec.uop_begin, dec.uop_end) inside the two
 * nested loops described by AluDecode (lp_1 is the inner loop, lp_0 the
 * outer), producing destination/source scratchpad indices and the loop
 * counters. When the instruction does not use an immediate operand
 * (!dec.alu_use_imm) each element takes two cycles: a "stutter" cycle that
 * asserts src_valid (source operand read) followed by an advance cycle that
 * asserts valid. io.last pulses on the final advance.
 */
class TensorAluIndexGenerator(debug: Boolean = false)(implicit p: Parameters) extends Module {
// Loop-counter widths are taken from the decoded instruction fields.
val cnt_o_width = (new AluDecode).lp_0.getWidth
val cnt_i_width = (new AluDecode).lp_1.getWidth
val io = IO(new Bundle {
val start = Input(Bool())
val last = Output(Bool())
val dec = Input(new AluDecode)
val valid = Output(Bool())
val src_valid = Output(Bool())
val dst_idx = Output(UInt(new TensorParams(tensorType="acc").memAddrBits.W))
val src_idx = Output(UInt(new TensorParams(tensorType="acc").memAddrBits.W))
val uop_idx = Output(UInt(log2Ceil(p(CoreKey).uopMemDepth).W))
val cnt_o = Output(UInt(cnt_o_width.W))
val cnt_i = Output(UInt(cnt_i_width.W))
})
io.last := false.B
val running = RegInit(false.B)
val stutter = RegInit(false.B)
// With an immediate operand only one scratchpad read per element is needed,
// so the generator advances every cycle; otherwise it alternates between a
// source-read (stutter) cycle and an advance cycle.
val advance = io.dec.alu_use_imm || stutter
when(!running && io.start) {
running := true.B
} .elsewhen(running && !advance) {
stutter := true.B
} .elsewhen(running && advance) {
when (io.last) {
running := false.B
}
stutter := false.B
}
// Inner (cnt_i) and outer (cnt_o) loop counters with their running
// destination/source address accumulators.
val cnt_i = Reg(chiselTypeOf(io.dec.lp_1))
val dst_i = Reg(chiselTypeOf(io.dst_idx))
val src_i = Reg(chiselTypeOf(io.src_idx))
val cnt_o = Reg(chiselTypeOf(io.dec.lp_0))
val dst_o = Reg(chiselTypeOf(io.dst_idx))
val src_o = Reg(chiselTypeOf(io.src_idx))
val uop_idx = Reg(chiselTypeOf(io.dec.uop_end))
io.valid := running && advance
io.src_valid := running && !advance
io.dst_idx := dst_i
io.src_idx := src_i
io.uop_idx := uop_idx
io.cnt_o := cnt_o
io.cnt_i := cnt_i
when(!running) {
// Idle: hold everything at the loop start so the first run begins cleanly.
cnt_i := 0.U; dst_i := 0.U; src_i := 0.U;
cnt_o := 0.U; dst_o := 0.U; src_o := 0.U;
uop_idx := io.dec.uop_begin
} .elsewhen (advance) {
// Innermost iteration: step through the uops of [uop_begin, uop_end).
when (uop_idx =/= io.dec.uop_end - 1.U) {
uop_idx := uop_idx + 1.U
}.otherwise {
uop_idx := io.dec.uop_begin
// Inner loop wrap: bump by the per-iteration strides dst_1/src_1.
when (cnt_i =/= io.dec.lp_1 - 1.U) {
cnt_i := cnt_i + 1.U
dst_i := dst_i + io.dec.dst_1
src_i := src_i + io.dec.src_1
}.otherwise {
// Outer loop wrap: restart inner accumulators from the new outer base.
when (cnt_o =/= io.dec.lp_0 - 1.U) {
val dst_tmp = dst_o + io.dec.dst_0
val src_tmp = src_o + io.dec.src_0
cnt_o := cnt_o + 1.U
dst_o := dst_tmp
src_o := src_tmp
cnt_i := 0.U
dst_i := dst_tmp
src_i := src_tmp
} .otherwise {
// All loops exhausted: pulse last; running clears in the FSM above.
io.last := true.B
}
}
}
}
}
/** Shared interface for tensor ALU implementations.
 *
 * Subclasses (TensorAluPipelined, TensorAluOrig) implement the datapath.
 * start kicks off execution of the decoded instruction in io.dec; done
 * signals completion. uop connects to the micro-op memory, acc/out to the
 * accumulator and output scratchpads.
 */
class TensorAluIfc(implicit p: Parameters) extends Module {
// Operand width equals the accumulator element width.
val aluBits = p(CoreKey).accBits
val io = IO(new Bundle {
val start = Input(Bool())
val done = Output(Bool())
val dec = Input(new AluDecode)
val uop = new UopMaster
val acc = new TensorMaster(tensorType = "acc")
val out = new TensorMaster(tensorType = "out")
})
}
/** Pipelined tensor ALU.
 *
 * Drives TensorAluIndexGenerator to produce uop/scratchpad indices, reads
 * operands from the acc scratchpad, runs them through dataSplitFactor
 * AluVector instances, and writes results back to both the acc and out
 * scratchpads. Issued indices flow through a 4-deep valid pipeline
 * (valid_r1..valid_r4); an in-flight counter gates io.done.
 */
class TensorAluPipelined(debug: Boolean = false)(implicit p: Parameters) extends TensorAluIfc {
val stateBits = 2
val inflightBits = 4
val dataSplitFactor = p(CoreKey).blockOutFactor
val sIdle::sRun::sWait::Nil = Enum(3)
val state = RegInit(init=sIdle)
// Count of issued-but-not-retired indices; done waits for this to drain.
val inflight = RegInit(0.U(inflightBits.W))
val index_generator = Module(new TensorAluIndexGenerator)
val aluDataReadPipeDelay = 0 // available for pipelining
// State Machine for compute io.done correctly
io.done := false.B
when(state === sIdle && io.start) {
state := sRun
}.elsewhen(state === sRun && index_generator.io.last) {
state := sWait
}.elsewhen(state === sWait && inflight === 0.U) {
state := sIdle
io.done := true.B
}
index_generator.io.start := io.start
index_generator.io.dec := io.dec
// second term works around funny clearing in uop register file flopped output
io.uop.idx.valid := index_generator.io.valid || index_generator.io.src_valid
io.uop.idx.bits := index_generator.io.uop_idx
// valid_r1..r4 track an issued dst index through the pipeline stages
// (index, read-data, ALU, write-back).
val valid_r1 = ShiftRegister(index_generator.io.valid, aluDataReadPipeDelay + 1, resetData=false.B, en = true.B)
val valid_r2 = RegNext(valid_r1, init=false.B)
val valid_r3 = RegNext(valid_r2, init=false.B)
val valid_r4 = RegNext(valid_r3, init=false.B)
// Issue (valid) and retire (valid_r4) in the same cycle cancel out,
// leaving the count unchanged (empty first branch).
when(index_generator.io.valid && valid_r4) {
}.elsewhen(index_generator.io.valid) {
assert(inflight =/= ((1<<inflightBits)-1).U) // counter must not overflow
inflight := inflight + 1.U
}.elsewhen(valid_r4) {
assert(inflight =/= 0.U) // counter must not underflow
inflight := inflight - 1.U
}
when(state === sIdle) {
assert(inflight === 0.U)
inflight := 0.U
}
// Source-read valids, stage-aligned like valid_r1..r4.
val src_valid_r1 = ShiftRegister(
index_generator.io.src_valid,
aluDataReadPipeDelay + 1,
resetData=false.B, en = true.B)
val src_valid_r2 = RegNext(src_valid_r1, init=false.B)
val src_valid_r3 = RegNext(src_valid_r2, init=false.B)
val src_valid_r4 = RegNext(src_valid_r3, init=false.B)
val dst_idx_r1 = ShiftRegister(index_generator.io.dst_idx, aluDataReadPipeDelay + 1)
val src_idx_r1 = ShiftRegister(index_generator.io.src_idx, aluDataReadPipeDelay + 1)
val uop_data_r1 = ShiftRegister(io.uop.data, aluDataReadPipeDelay)
// Decode the uop: u0 is the dst offset; u2:u1 concatenate (u2 shifted by
// log2(inpMemDepth)) into the src offset.
val dst_offset = uop_data_r1.bits.u0
val w = dst_offset.getWidth
val u2 = uop_data_r1.bits.u2.asTypeOf(UInt(w.W))
val s = log2Ceil(p(CoreKey).inpMemDepth)
val u1 = uop_data_r1.bits.u1.asTypeOf(UInt(w.W))
val src_offset = (u2 << s) | u1
// split registers of stage 2 by data groups
val accRdIdxValid = valid_r1 || src_valid_r1
for (idx <- 0 until dataSplitFactor) {
io.acc.rd(idx).idx.valid := RegNext(accRdIdxValid)
}
val new_src_idx_r1 = src_idx_r1 + src_offset
val src_idx_r2 = RegNext(new_src_idx_r1)
val src_idx_r3 = RegNext(src_idx_r2)
val new_dst_idx_r1 = dst_idx_r1 + dst_offset
val dst_idx_r2 = RegNext(new_dst_idx_r1)
val dst_idx_r3 = RegNext(dst_idx_r2)
val dst_idx_r4 = RegNext(dst_idx_r3)
// split registers of stage 2 by data groups
// Source-read cycles (or immediate mode) address the src index; otherwise dst.
val accRdIdxBits = Mux(src_valid_r1 || io.dec.alu_use_imm, new_src_idx_r1, new_dst_idx_r1)
for (idx <- 0 until dataSplitFactor) {
io.acc.rd(idx).idx.bits := RegNext(accRdIdxBits)
assert(io.acc.rd(idx).data.valid === (valid_r3 || src_valid_r3))
}
require(io.out.splitWidth == 1 && io.out.splitLength == 1, "-F- Out split write is not supported")
val numVecUnits = dataSplitFactor
val outData = Wire(chiselTypeOf(io.out.wr(0).bits.data))
val dataRemapB = Wire(Vec(numVecUnits, chiselTypeOf(io.acc.rd(0).data.bits)))
val dataRemapA = Wire(Vec(numVecUnits, chiselTypeOf(io.acc.rd(0).data.bits)))
// numVecUnits is a pow of 2
// split dec bits pipe further if there are many vecUnits
val decSplitNb0 = if (numVecUnits < 8) 1 else 2
val decSplit0 = Wire(Vec(decSplitNb0, chiselTypeOf(io.dec)))
for (idx <- 0 until decSplitNb0) {
decSplit0(idx) := ShiftRegister(io.dec, if(aluDataReadPipeDelay < 2) 0 else 1)
}
for (idx <- 0 until numVecUnits) {
val alu = Module(new AluVector)
// Gather this unit's slice of operand B from the grouped acc read ports.
for(aluLenIdx <- 0 until alu.io.acc_b.lenSplit) {
for(aluWdtIdx <- 0 until alu.io.acc_b.widthSplit) {
val (accGrpIdx, accLenIdx, accWdtIdx) =
alu.io.acc_b.reindexDataFromGroup(idx, aluLenIdx, aluWdtIdx)
dataRemapB(idx)(aluLenIdx)(aluWdtIdx) :=
io.acc.rd(accGrpIdx).data.bits(accLenIdx)(accWdtIdx)
}
}
// The source operand arrives one cycle before the destination read; hold it.
val save_src = RegNext(dataRemapB(idx))
val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
tensorImm.data.valid := valid_r3
val tensorImmBits_piped = ShiftRegister(
decSplit0(idx/(numVecUnits/decSplitNb0)).alu_imm,
if(aluDataReadPipeDelay < 2) aluDataReadPipeDelay else aluDataReadPipeDelay -1)
// Sign-extend the immediate from C_ALU_IMM_BITS to the accumulator width.
tensorImm.data.bits.foreach { b =>
b.foreach { c =>
c := Mux(tensorImmBits_piped(C_ALU_IMM_BITS - 1),
Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), tensorImmBits_piped), tensorImmBits_piped)
}
}
// alu
val tensorOpBits_piped = ShiftRegister(
decSplit0(idx/(numVecUnits/decSplitNb0)).alu_op,
if(aluDataReadPipeDelay < 2) aluDataReadPipeDelay else aluDataReadPipeDelay -1)
// SHR with a negative immediate is really a left shift: rewrite the opcode.
val isSHR = (tensorOpBits_piped === ALU_OP(3))
val neg_shift = isSHR & tensorImmBits_piped(C_ALU_IMM_BITS - 1)
val fixme_alu_op = Mux(
neg_shift,
ALU_OP(4), // use opcode = 4 for left shift
tensorOpBits_piped)
alu.io.opcode := fixme_alu_op
assert(!valid_r3 || io.acc.rd(idx).data.valid)
alu.io.acc_a.data.valid := RegNext(valid_r2) // valid_r3 split
// Gather this unit's slice of operand A (destination read) likewise.
for(aluLenIdx <- 0 until alu.io.acc_a.lenSplit) {
for(aluWdtIdx <- 0 until alu.io.acc_a.widthSplit) {
val (accGrpIdx, accLenIdx, accWdtIdx) =
alu.io.acc_a.reindexDataFromGroup(idx, aluLenIdx, aluWdtIdx)
dataRemapA(idx)(aluLenIdx)(aluWdtIdx) :=
io.acc.rd(accGrpIdx).data.bits(accLenIdx)(accWdtIdx)
alu.io.acc_a.data.bits := dataRemapA(idx)
}
}
val tensorUseImmBits_piped = ShiftRegister(
decSplit0(idx/(numVecUnits/decSplitNb0)).alu_use_imm,
if(aluDataReadPipeDelay < 2) aluDataReadPipeDelay else aluDataReadPipeDelay -1)
// Operand B is either the sign-extended immediate or the saved source read.
alu.io.acc_b.data.valid := Mux(tensorUseImmBits_piped,
tensorImm.data.valid,
valid_r3)
alu.io.acc_b.data.bits := Mux(tensorUseImmBits_piped,
tensorImm.data.bits,
save_src)
assert(alu.io.acc_y.data.valid === valid_r4)
// Write-back to the acc scratchpad in stage 4.
io.acc.wr(idx).valid := valid_r4
io.acc.wr(idx).bits.idx := dst_idx_r4
for(aluLenIdx <- 0 until alu.io.acc_y.lenSplit) {
for(aluWdtIdx <- 0 until alu.io.acc_y.widthSplit) {
val (accGrpIdx, accLenIdx, accWdtIdx) =
alu.io.acc_y.reindexDataFromGroup(idx, aluLenIdx, aluWdtIdx)
io.acc.wr(accGrpIdx).bits.data(accLenIdx)(accWdtIdx) :=
alu.io.acc_y.data.bits(aluLenIdx)(aluWdtIdx)
}
}
assert(alu.io.out.data.valid === valid_r4)
// Scatter this unit's slice into the single (unsplit) out write port.
for (idx1 <- 0 until io.out.tensorLength) {
for (idx2 <- 0 until io.out.tensorWidth/numVecUnits) {
outData(idx1)(idx*io.out.tensorWidth/numVecUnits + idx2) := alu.io.out.data.bits(idx1)(idx2)
}
}
}
// comment for split write
io.out.wr(0).valid := valid_r4
io.out.wr(0).bits.idx := dst_idx_r4
io.out.wr(0).bits.data := outData
io.out.tieoffRead()
// RAW hazard check: a stage-3 read of the index being written in stage 4
// would require a bypass that is not implemented.
val bypass_dst = valid_r3 && valid_r4 && (dst_idx_r4 === dst_idx_r3)
val bypass_src = src_valid_r3 && valid_r4 && (dst_idx_r4 === src_idx_r3)
// Do we need a bypass
// NOTE(review): the s"" interpolations below are evaluated at elaboration
// time, so the messages show node info, not runtime index values — confirm
// whether a Printable (p"/cf") message was intended.
assert(!bypass_dst, s"Bypass required on dst_idx read $dst_idx_r3 RAW with write $dst_idx_r4\n")
assert(!bypass_src, s"Bypass required on src_idx read $src_idx_r3 RAW with write $dst_idx_r4\n")
}
/** TensorAluOrig.
 * This unit instantiate the ALU vector unit (AluVector) and go over the
 * micro-ops (uops) which are used to read the source operands (vectors)
 * from the acc-scratchpad and then they are written back the same
 * acc-scratchpad.
 *
 * Non-pipelined reference implementation: each element walks the full
 * sReadUop -> sComputeIdx -> sReadTensorA -> sReadTensorB -> sExe sequence
 * before the next one starts.
 */
class TensorAluOrig(debug: Boolean = false)(implicit p: Parameters) extends TensorAluIfc {
val sIdle :: sReadUop :: sComputeIdx :: sReadTensorA :: sReadTensorB :: sExe :: Nil =
Enum(6)
val state = RegInit(sIdle)
val alu = Module(new AluVector)
val dec = io.dec
val uop_idx = Reg(chiselTypeOf(dec.uop_end))
val uop_end = dec.uop_end
val uop_dst = Reg(chiselTypeOf(io.uop.data.bits.u0)) // width can address entire acc
val uop_src = Reg(chiselTypeOf(io.uop.data.bits.u0)) // width can address entire acc
// Outer (lp_0) and inner (lp_1) loop counters with address accumulators.
val cnt_o = Reg(chiselTypeOf(dec.lp_0))
val dst_o = Reg(chiselTypeOf(io.uop.data.bits.u0))
val src_o = Reg(chiselTypeOf(io.uop.data.bits.u0))
val cnt_i = Reg(chiselTypeOf(dec.lp_1))
val dst_i = Reg(chiselTypeOf(io.uop.data.bits.u0))
val src_i = Reg(chiselTypeOf(io.uop.data.bits.u0))
// Done when the last uop of the last inner and outer iteration retires.
val done =
state === sExe &
alu.io.out.data.valid &
(cnt_o === dec.lp_0 - 1.U) &
(cnt_i === dec.lp_1 - 1.U) &
(uop_idx === uop_end - 1.U)
switch(state) {
is(sIdle) {
when(io.start) {
state := sReadUop
}
}
is(sReadUop) {
state := sComputeIdx
}
is(sComputeIdx) {
state := sReadTensorA
}
is(sReadTensorA) {
state := sReadTensorB
}
is(sReadTensorB) {
state := sExe
}
is(sExe) {
// Loop back per element until every loop iteration is exhausted.
when(alu.io.out.data.valid) {
when(
(cnt_o === dec.lp_0 - 1.U) &&
(cnt_i === dec.lp_1 - 1.U) &&
(uop_idx === uop_end - 1.U)) {
state := sIdle
}.otherwise {
state := sReadUop
}
}
}
}
// uop_idx wraps back to uop_begin at idle and at the end of each uop pass.
when(
state === sIdle ||
(state === sExe &&
alu.io.out.data.valid &&
uop_idx === uop_end - 1.U)) {
uop_idx := dec.uop_begin
}.elsewhen(state === sExe && alu.io.out.data.valid) {
uop_idx := uop_idx + 1.U
}
// Outer-loop bookkeeping: advance by the dst_0/src_0 strides when the
// inner loop completes a full pass.
when(state === sIdle) {
cnt_o := 0.U
dst_o := 0.U
src_o := 0.U
}.elsewhen(
state === sExe &&
alu.io.out.data.valid &&
uop_idx === uop_end - 1.U &&
cnt_i === dec.lp_1 - 1.U) {
cnt_o := cnt_o + 1.U
dst_o := dst_o + dec.dst_0
src_o := src_o + dec.src_0
}
// Inner-loop bookkeeping: reload from the outer accumulators on wrap,
// otherwise advance by the dst_1/src_1 strides.
when(state === sIdle) {
cnt_i := 0.U
dst_i := 0.U
src_i := 0.U
}.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
cnt_i := 0.U
dst_i := dst_o
src_i := src_o
}.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
cnt_i := cnt_i + 1.U
dst_i := dst_i + dec.dst_1
src_i := src_i + dec.src_1
}
// Combine the fetched uop fields with the loop accumulators: u0 gives the
// dst address; (u2 << log2(inpMemDepth)) | u1 gives the src address.
when(state === sComputeIdx && io.uop.data.valid) {
uop_dst := io.uop.data.bits.u0 + dst_i
uop_src := ((io.uop.data.bits.u2.asTypeOf(UInt(width = uop_dst.getWidth.W)) << log2Ceil(p(CoreKey).inpMemDepth))
| io.uop.data.bits.u1.asTypeOf(UInt(width = uop_dst.getWidth.W))) + src_i
}
// uop
io.uop.idx.valid := state === sReadUop
io.uop.idx.bits := uop_idx
val dataSplitFactor = p(CoreKey).blockOutFactor
// Operand reads: A (dst) in sReadTensorA, B (src) in sReadTensorB unless
// the instruction uses an immediate.
val accRdValid = state === sReadTensorA | (state === sReadTensorB & ~dec.alu_use_imm)
val accRdIdx = Mux(state === sReadTensorA, uop_dst, uop_src)
for (idx <- 0 until dataSplitFactor) {
// acc (input)
io.acc.rd(idx).idx.valid := accRdValid
io.acc.rd(idx).idx.bits := accRdIdx
// imm
val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
tensorImm.data.valid := state === sReadTensorB
// Sign-extend the immediate from C_ALU_IMM_BITS to the accumulator width.
tensorImm.data.bits.foreach { b =>
b.foreach { c =>
c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
}
}
// alu
// SHR with a negative immediate is really a left shift: rewrite the opcode.
val isSHR = (dec.alu_op === ALU_OP(3))
val isSHL = isSHR & dec.alu_imm(C_ALU_IMM_BITS - 1)
// opcode - min:0, max:1, add:2, shr:3, shl:4
val fixme_alu_op = Cat(isSHL, Mux(isSHL, 0.U, dec.alu_op(1, 0)))
alu.io.opcode := fixme_alu_op
alu.io.acc_a.data.valid := io.acc.rd(idx).data.valid & state === sReadTensorB
alu.io.acc_a.data.bits <> io.acc.rd(idx).data.bits
alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
tensorImm.data.valid,
io.acc.rd(idx).data.valid & state === sExe)
alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
tensorImm.data.bits,
io.acc.rd(idx).data.bits)
// acc (output)
io.acc.wr(idx).valid := alu.io.acc_y.data.valid
io.acc.wr(idx).bits.idx := uop_dst
io.acc.wr(idx).bits.data <> alu.io.acc_y.data.bits
// out
io.out.wr(idx).valid := alu.io.out.data.valid
io.out.wr(idx).bits.idx := uop_dst
io.out.wr(idx).bits.data <> alu.io.out.data.bits
}
io.out.tieoffRead() // write-only
io.done := done
// Elaboration-time debug tracing of the uop walk and operand/result lanes.
if (debug) {
when(state === sReadUop) {
printf("[TensorAlu] [uop] idx:%x\n", uop_idx)
}
when(state === sReadTensorA) {
printf("[TensorAlu] [uop] dst:%x src:%x\n", uop_dst, uop_src)
}
when(state === sIdle && io.start) {
printf(p"[TensorAlu] decode:$dec\n")
}
alu.io.acc_a.data.bits.foreach { tensor =>
tensor.zipWithIndex.foreach {
case (elem, i) =>
when(alu.io.acc_a.data.valid) {
printf("[TensorAlu] [a] i:%x val:%x\n", i.U, elem)
}
}
}
alu.io.acc_b.data.bits.foreach { tensor =>
tensor.zipWithIndex.foreach {
case (elem, i) =>
when(alu.io.acc_b.data.valid) {
printf("[TensorAlu] [b] i:%x val:%x\n", i.U, elem)
}
}
}
alu.io.acc_y.data.bits.foreach { tensor =>
tensor.zipWithIndex.foreach {
case (elem, i) =>
when(alu.io.acc_y.data.valid) {
printf("[TensorAlu] [y] i:%x val:%x\n", i.U, elem)
}
}
}
alu.io.out.data.bits.foreach { tensor =>
tensor.zipWithIndex.foreach {
case (elem, i) =>
when(alu.io.out.data.valid) {
printf("[TensorAlu] [out] i:%x val:%x\n", i.U, elem)
}
}
}
}
}
/** Default tensor ALU implementation: an alias of the pipelined version. */
class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends TensorAluPipelined(debug)