hardware/chisel/src/main/scala/core/TensorAlu.scala - tvm-vta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package vta.core

 import chisel3._
 import chisel3.util._
 import vta.util.config._

 /** ALU datapath */
 class Alu(implicit p: Parameters) extends Module {
   val aluBits = p(CoreKey).accBits
   val io = IO(new Bundle {
     val opcode = Input(UInt(C_ALU_OP_BITS.W))
     val a = Input(SInt(aluBits.W))
     val b = Input(SInt(aluBits.W))
     val y = Output(SInt(aluBits.W))
   })

   // FIXME: the following three will change once we support properly SHR and SHL
   val ub = io.b.asUInt
   val width = log2Ceil(aluBits)
   val m = ~ub(width - 1, 0) + 1.U

   val n = ub(width - 1, 0)
   // opcode - min:0, max:1, add:2, shr:3, shl:4
   val fop = Seq(Mux(io.a < io.b, io.a, io.b), Mux(io.a < io.b, io.b, io.a),
     io.a + io.b, io.a >> n, io.a << m)

   val opmux = Seq.tabulate(ALU_OP_NUM)(i => ALU_OP(i) -> fop(i))
   io.y := MuxLookup(io.opcode, io.a, opmux)
 }

 /** Pipelined ALU */
 class AluReg(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val opcode = Input(UInt(C_ALU_OP_BITS.W))
     val a = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
     val b = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
     val y = ValidIO(UInt(p(CoreKey).accBits.W))
   })
   val alu = Module(new Alu)
   val rA = RegEnable(io.a.bits, io.a.valid)
   val rB = RegEnable(io.b.bits, io.b.valid)
   val valid = RegNext(io.b.valid)

   alu.io.opcode := io.opcode

   // register input
   alu.io.a := rA.asSInt
   alu.io.b := rB.asSInt

   // output
   io.y.valid := valid
   io.y.bits := alu.io.y.asUInt
 }

 /** Vector of pipeline ALUs */
 class AluVector(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val opcode = Input(UInt(C_ALU_OP_BITS.W))
     val acc_a = new TensorMasterData(tensorType = "acc")
     val acc_b = new TensorMasterData(tensorType = "acc")
     val acc_y = new TensorClientData(tensorType = "acc")
     val out = new TensorClientData(tensorType = "out")
   })
   val blockOut = p(CoreKey).blockOut
   val f = Seq.fill(blockOut)(Module(new AluReg))
   val valid = Wire(Vec(blockOut, Bool()))
   for (i <- 0 until blockOut) {
     f(i).io.opcode := io.opcode
     f(i).io.a.valid := io.acc_a.data.valid
     f(i).io.a.bits := io.acc_a.data.bits(0)(i)
     f(i).io.b.valid := io.acc_b.data.valid
     f(i).io.b.bits := io.acc_b.data.bits(0)(i)
     valid(i) := f(i).io.y.valid
     io.acc_y.data.bits(0)(i) := f(i).io.y.bits
     io.out.data.bits(0)(i) := f(i).io.y.bits
   }
   io.acc_y.data.valid := valid.asUInt.andR
   io.out.data.valid := valid.asUInt.andR
 }

 /** TensorAlu.
  *
  * This unit instantiate the ALU vector unit (AluVector) and go over the
  * micro-ops (uops) which are used to read the source operands (vectors)
  * from the acc-scratchpad and then they are written back the same
  * acc-scratchpad.
  */
 class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val aluBits = p(CoreKey).accBits
   val io = IO(new Bundle {
     val start = Input(Bool())
     val done = Output(Bool())
     val inst = Input(UInt(INST_BITS.W))
     val uop = new UopMaster
     val acc = new TensorMaster(tensorType = "acc")
     val out = new TensorMaster(tensorType = "out")
   })
   val sIdle :: sReadUop :: sComputeIdx :: sReadTensorA :: sReadTensorB :: sExe :: Nil =
     Enum(6)
   val state = RegInit(sIdle)
   val alu = Module(new AluVector)
   val dec = io.inst.asTypeOf(new AluDecode)
   val uop_idx = Reg(chiselTypeOf(dec.uop_end))
   val uop_end = dec.uop_end
   val uop_dst = Reg(chiselTypeOf(dec.uop_end))
   val uop_src = Reg(chiselTypeOf(dec.uop_end))
   val cnt_o = Reg(chiselTypeOf(dec.lp_0))
   val dst_o = Reg(chiselTypeOf(dec.uop_end))
   val src_o = Reg(chiselTypeOf(dec.uop_end))
   val cnt_i = Reg(chiselTypeOf(dec.lp_1))
   val dst_i = Reg(chiselTypeOf(dec.uop_end))
   val src_i = Reg(chiselTypeOf(dec.uop_end))
   val done =
     state === sExe &
       alu.io.out.data.valid &
       (cnt_o === dec.lp_0 - 1.U) &
       (cnt_i === dec.lp_1 - 1.U) &
       (uop_idx === uop_end - 1.U)

   switch(state) {
     is(sIdle) {
       when(io.start) {
         state := sReadUop
       }
     }
     is(sReadUop) {
       state := sComputeIdx
     }
     is(sComputeIdx) {
       state := sReadTensorA
     }
     is(sReadTensorA) {
       state := sReadTensorB
     }
     is(sReadTensorB) {
       state := sExe
     }
     is(sExe) {
       when(alu.io.out.data.valid) {
         when(
           (cnt_o === dec.lp_0 - 1.U) &&
             (cnt_i === dec.lp_1 - 1.U) &&
             (uop_idx === uop_end - 1.U)) {
           state := sIdle
         }.otherwise {
           state := sReadUop
         }
       }
     }
   }

   when(
     state === sIdle ||
       (state === sExe &&
         alu.io.out.data.valid &&
         uop_idx === uop_end - 1.U)) {
     uop_idx := dec.uop_begin
   }.elsewhen(state === sExe && alu.io.out.data.valid) {
     uop_idx := uop_idx + 1.U
   }

   when(state === sIdle) {
     cnt_o := 0.U
     dst_o := 0.U
     src_o := 0.U
   }.elsewhen(
     state === sExe &&
       alu.io.out.data.valid &&
       uop_idx === uop_end - 1.U &&
       cnt_i === dec.lp_1 - 1.U) {
     cnt_o := cnt_o + 1.U
     dst_o := dst_o + dec.dst_0
     src_o := src_o + dec.src_0
   }

   when(state === sIdle) {
     cnt_i := 0.U
     dst_i := 0.U
     src_i := 0.U
   }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
     cnt_i := 0.U
     dst_i := dst_o
     src_i := src_o
   }.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
     cnt_i := cnt_i + 1.U
     dst_i := dst_i + dec.dst_1
     src_i := src_i + dec.src_1
   }

   when(state === sComputeIdx && io.uop.data.valid) {
     uop_dst := io.uop.data.bits.u0 + dst_i
     uop_src := io.uop.data.bits.u1 + src_i
   }

   // uop
   io.uop.idx.valid := state === sReadUop
   io.uop.idx.bits := uop_idx

   // acc (input)
   io.acc.rd.idx.valid := state === sReadTensorA | (state === sReadTensorB & ~dec.alu_use_imm)
   io.acc.rd.idx.bits := Mux(state === sReadTensorA, uop_dst, uop_src)

   // imm
   val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
   tensorImm.data.valid := state === sReadTensorB
   tensorImm.data.bits.foreach { b =>
     b.foreach { c =>
       c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
         Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
     }
   }

   // alu
   val isSHR = dec.alu_op === ALU_OP(3)
   val isSHL = isSHR & dec.alu_imm(C_ALU_IMM_BITS - 1)
   // opcode - min:0, max:1, add:2, shr:3, shl:4
   val fixme_alu_op = Cat(isSHL, Mux(isSHL, 0.U, dec.alu_op(1, 0)))
   alu.io.opcode := fixme_alu_op
   alu.io.acc_a.data.valid := io.acc.rd.data.valid & state === sReadTensorB
   alu.io.acc_a.data.bits <> io.acc.rd.data.bits
   alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
     tensorImm.data.valid,
     io.acc.rd.data.valid & state === sExe)
   alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
     tensorImm.data.bits,
     io.acc.rd.data.bits)

   // acc (output)
   io.acc.wr.valid := alu.io.acc_y.data.valid
   io.acc.wr.bits.idx := uop_dst
   io.acc.wr.bits.data <> alu.io.acc_y.data.bits

   // out
   io.out.wr.valid := alu.io.out.data.valid
   io.out.wr.bits.idx := uop_dst
   io.out.wr.bits.data <> alu.io.out.data.bits
   io.out.tieoffRead() // write-only

   io.done := done

   if (debug) {

     when(state === sReadUop) {
       printf("[TensorAlu] [uop] idx:%x\n", uop_idx)
     }

     when(state === sReadTensorA) {
       printf("[TensorAlu] [uop] dst:%x src:%x\n", uop_dst, uop_src)
     }

     when(state === sIdle && io.start) {
       printf(p"[TensorAlu] decode:$dec\n")
     }

     alu.io.acc_a.data.bits.foreach { tensor =>
       tensor.zipWithIndex.foreach {
         case (elem, i) =>
           when(alu.io.acc_a.data.valid) {
             printf("[TensorAlu] [a] i:%x val:%x\n", i.U, elem)
           }
       }
     }

     alu.io.acc_b.data.bits.foreach { tensor =>
       tensor.zipWithIndex.foreach {
         case (elem, i) =>
           when(alu.io.acc_b.data.valid) {
             printf("[TensorAlu] [b] i:%x val:%x\n", i.U, elem)
           }
       }
     }

     alu.io.acc_y.data.bits.foreach { tensor =>
       tensor.zipWithIndex.foreach {
         case (elem, i) =>
           when(alu.io.acc_y.data.valid) {
             printf("[TensorAlu] [y] i:%x val:%x\n", i.U, elem)
           }
       }
     }

     alu.io.out.data.bits.foreach { tensor =>
       tensor.zipWithIndex.foreach {
         case (elem, i) =>
           when(alu.io.out.data.valid) {
             printf("[TensorAlu] [out] i:%x val:%x\n", i.U, elem)
           }
       }
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package vta.core

	import chisel3._
	import chisel3.util._
	import vta.util.config._

	/** ALU datapath */
	class Alu(implicit p: Parameters) extends Module {
	val aluBits = p(CoreKey).accBits
	val io = IO(new Bundle {
	val opcode = Input(UInt(C_ALU_OP_BITS.W))
	val a = Input(SInt(aluBits.W))
	val b = Input(SInt(aluBits.W))
	val y = Output(SInt(aluBits.W))
	})

	// FIXME: the following three will change once we support properly SHR and SHL
	val ub = io.b.asUInt
	val width = log2Ceil(aluBits)
	val m = ~ub(width - 1, 0) + 1.U

	val n = ub(width - 1, 0)
	// opcode - min:0, max:1, add:2, shr:3, shl:4
	val fop = Seq(Mux(io.a < io.b, io.a, io.b), Mux(io.a < io.b, io.b, io.a),
	io.a + io.b, io.a >> n, io.a << m)

	val opmux = Seq.tabulate(ALU_OP_NUM)(i => ALU_OP(i) -> fop(i))
	io.y := MuxLookup(io.opcode, io.a, opmux)
	}

	/** Pipelined ALU */
	class AluReg(implicit p: Parameters) extends Module {
	val io = IO(new Bundle {
	val opcode = Input(UInt(C_ALU_OP_BITS.W))
	val a = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
	val b = Flipped(ValidIO(UInt(p(CoreKey).accBits.W)))
	val y = ValidIO(UInt(p(CoreKey).accBits.W))
	})
	val alu = Module(new Alu)
	val rA = RegEnable(io.a.bits, io.a.valid)
	val rB = RegEnable(io.b.bits, io.b.valid)
	val valid = RegNext(io.b.valid)

	alu.io.opcode := io.opcode

	// register input
	alu.io.a := rA.asSInt
	alu.io.b := rB.asSInt

	// output
	io.y.valid := valid
	io.y.bits := alu.io.y.asUInt
	}

	/** Vector of pipeline ALUs */
	class AluVector(implicit p: Parameters) extends Module {
	val io = IO(new Bundle {
	val opcode = Input(UInt(C_ALU_OP_BITS.W))
	val acc_a = new TensorMasterData(tensorType = "acc")
	val acc_b = new TensorMasterData(tensorType = "acc")
	val acc_y = new TensorClientData(tensorType = "acc")
	val out = new TensorClientData(tensorType = "out")
	})
	val blockOut = p(CoreKey).blockOut
	val f = Seq.fill(blockOut)(Module(new AluReg))
	val valid = Wire(Vec(blockOut, Bool()))
	for (i <- 0 until blockOut) {
	f(i).io.opcode := io.opcode
	f(i).io.a.valid := io.acc_a.data.valid
	f(i).io.a.bits := io.acc_a.data.bits(0)(i)
	f(i).io.b.valid := io.acc_b.data.valid
	f(i).io.b.bits := io.acc_b.data.bits(0)(i)
	valid(i) := f(i).io.y.valid
	io.acc_y.data.bits(0)(i) := f(i).io.y.bits
	io.out.data.bits(0)(i) := f(i).io.y.bits
	}
	io.acc_y.data.valid := valid.asUInt.andR
	io.out.data.valid := valid.asUInt.andR
	}

	/** TensorAlu.
	*
	* This unit instantiate the ALU vector unit (AluVector) and go over the
	* micro-ops (uops) which are used to read the source operands (vectors)
	* from the acc-scratchpad and then they are written back the same
	* acc-scratchpad.
	*/
	class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
	val aluBits = p(CoreKey).accBits
	val io = IO(new Bundle {
	val start = Input(Bool())
	val done = Output(Bool())
	val inst = Input(UInt(INST_BITS.W))
	val uop = new UopMaster
	val acc = new TensorMaster(tensorType = "acc")
	val out = new TensorMaster(tensorType = "out")
	})
	val sIdle :: sReadUop :: sComputeIdx :: sReadTensorA :: sReadTensorB :: sExe :: Nil =
	Enum(6)
	val state = RegInit(sIdle)
	val alu = Module(new AluVector)
	val dec = io.inst.asTypeOf(new AluDecode)
	val uop_idx = Reg(chiselTypeOf(dec.uop_end))
	val uop_end = dec.uop_end
	val uop_dst = Reg(chiselTypeOf(dec.uop_end))
	val uop_src = Reg(chiselTypeOf(dec.uop_end))
	val cnt_o = Reg(chiselTypeOf(dec.lp_0))
	val dst_o = Reg(chiselTypeOf(dec.uop_end))
	val src_o = Reg(chiselTypeOf(dec.uop_end))
	val cnt_i = Reg(chiselTypeOf(dec.lp_1))
	val dst_i = Reg(chiselTypeOf(dec.uop_end))
	val src_i = Reg(chiselTypeOf(dec.uop_end))
	val done =
	state === sExe &
	alu.io.out.data.valid &
	(cnt_o === dec.lp_0 - 1.U) &
	(cnt_i === dec.lp_1 - 1.U) &
	(uop_idx === uop_end - 1.U)

	switch(state) {
	is(sIdle) {
	when(io.start) {
	state := sReadUop
	}
	}
	is(sReadUop) {
	state := sComputeIdx
	}
	is(sComputeIdx) {
	state := sReadTensorA
	}
	is(sReadTensorA) {
	state := sReadTensorB
	}
	is(sReadTensorB) {
	state := sExe
	}
	is(sExe) {
	when(alu.io.out.data.valid) {
	when(
	(cnt_o === dec.lp_0 - 1.U) &&
	(cnt_i === dec.lp_1 - 1.U) &&
	(uop_idx === uop_end - 1.U)) {
	state := sIdle
	}.otherwise {
	state := sReadUop
	}
	}
	}
	}

	when(
	state === sIdle \|\|
	(state === sExe &&
	alu.io.out.data.valid &&
	uop_idx === uop_end - 1.U)) {
	uop_idx := dec.uop_begin
	}.elsewhen(state === sExe && alu.io.out.data.valid) {
	uop_idx := uop_idx + 1.U
	}

	when(state === sIdle) {
	cnt_o := 0.U
	dst_o := 0.U
	src_o := 0.U
	}.elsewhen(
	state === sExe &&
	alu.io.out.data.valid &&
	uop_idx === uop_end - 1.U &&
	cnt_i === dec.lp_1 - 1.U) {
	cnt_o := cnt_o + 1.U
	dst_o := dst_o + dec.dst_0
	src_o := src_o + dec.src_0
	}

	when(state === sIdle) {
	cnt_i := 0.U
	dst_i := 0.U
	src_i := 0.U
	}.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
	cnt_i := 0.U
	dst_i := dst_o
	src_i := src_o
	}.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
	cnt_i := cnt_i + 1.U
	dst_i := dst_i + dec.dst_1
	src_i := src_i + dec.src_1
	}

	when(state === sComputeIdx && io.uop.data.valid) {
	uop_dst := io.uop.data.bits.u0 + dst_i
	uop_src := io.uop.data.bits.u1 + src_i
	}

	// uop
	io.uop.idx.valid := state === sReadUop
	io.uop.idx.bits := uop_idx

	// acc (input)
	io.acc.rd.idx.valid := state === sReadTensorA \| (state === sReadTensorB & ~dec.alu_use_imm)
	io.acc.rd.idx.bits := Mux(state === sReadTensorA, uop_dst, uop_src)

	// imm
	val tensorImm = Wire(new TensorClientData(tensorType = "acc"))
	tensorImm.data.valid := state === sReadTensorB
	tensorImm.data.bits.foreach { b =>
	b.foreach { c =>
	c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
	Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
	}
	}

	// alu
	val isSHR = dec.alu_op === ALU_OP(3)
	val isSHL = isSHR & dec.alu_imm(C_ALU_IMM_BITS - 1)
	// opcode - min:0, max:1, add:2, shr:3, shl:4
	val fixme_alu_op = Cat(isSHL, Mux(isSHL, 0.U, dec.alu_op(1, 0)))
	alu.io.opcode := fixme_alu_op
	alu.io.acc_a.data.valid := io.acc.rd.data.valid & state === sReadTensorB
	alu.io.acc_a.data.bits <> io.acc.rd.data.bits
	alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
	tensorImm.data.valid,
	io.acc.rd.data.valid & state === sExe)
	alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
	tensorImm.data.bits,
	io.acc.rd.data.bits)

	// acc (output)
	io.acc.wr.valid := alu.io.acc_y.data.valid
	io.acc.wr.bits.idx := uop_dst
	io.acc.wr.bits.data <> alu.io.acc_y.data.bits

	// out
	io.out.wr.valid := alu.io.out.data.valid
	io.out.wr.bits.idx := uop_dst
	io.out.wr.bits.data <> alu.io.out.data.bits
	io.out.tieoffRead() // write-only

	io.done := done

	if (debug) {

	when(state === sReadUop) {
	printf("[TensorAlu] [uop] idx:%x\n", uop_idx)
	}

	when(state === sReadTensorA) {
	printf("[TensorAlu] [uop] dst:%x src:%x\n", uop_dst, uop_src)
	}

	when(state === sIdle && io.start) {
	printf(p"[TensorAlu] decode:$dec\n")
	}

	alu.io.acc_a.data.bits.foreach { tensor =>
	tensor.zipWithIndex.foreach {
	case (elem, i) =>
	when(alu.io.acc_a.data.valid) {
	printf("[TensorAlu] [a] i:%x val:%x\n", i.U, elem)
	}
	}
	}

	alu.io.acc_b.data.bits.foreach { tensor =>
	tensor.zipWithIndex.foreach {
	case (elem, i) =>
	when(alu.io.acc_b.data.valid) {
	printf("[TensorAlu] [b] i:%x val:%x\n", i.U, elem)
	}
	}
	}

	alu.io.acc_y.data.bits.foreach { tensor =>
	tensor.zipWithIndex.foreach {
	case (elem, i) =>
	when(alu.io.acc_y.data.valid) {
	printf("[TensorAlu] [y] i:%x val:%x\n", i.U, elem)
	}
	}
	}

	alu.io.out.data.bits.foreach { tensor =>
	tensor.zipWithIndex.foreach {
	case (elem, i) =>
	when(alu.io.out.data.valid) {
	printf("[TensorAlu] [out] i:%x val:%x\n", i.U, elem)
	}
	}
	}
	}
	}