| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package vta.core |
| |
| import scala.math.pow |
| import scala.math.sqrt |
| |
| import chisel3._ |
| import chisel3.util._ |
| import vta.util.config._ |
| import vta.shell._ |
| |
| |
| /** TensorLoad. |
| * |
| * Load 1D and 2D tensors from main memory (DRAM) to input/weight |
| * scratchpads (SRAM). Also, there is support for zero padding, while |
| * doing the load. |
| * |
| * Structure, as wired below: GenVMECmd issues the VME read commands, |
| * ReadVMEData converts every received data beat into a destination |
| * tensor index and column, and ZeroPadding supplies the indices of the |
| * tensors that must be written with zeros. Address, enable and data of a |
| * load are delayed by writePipeLatency before reaching the SyncReadMem |
| * macros. When the (equally delayed) state is idle, the macros are |
| * written directly from io.tensor.wr instead. |
| * |
| * @param tensorType tensor kind; passed to TensorParams and to the helpers |
| * @param debug when true, printf tracing of scratchpad writes is elaborated |
| */ |
| class TensorLoadNarrowVME(tensorType: String = "none", debug: Boolean = false)( |
| implicit p: Parameters) |
| extends Module { |
| val tp = new TensorParams(tensorType) |
| val mp = p(ShellKey).memParams |
| val io = IO(new Bundle { |
| val start = Input(Bool()) /* moves the module to sBusy and restarts the helpers */ |
| val done = Output(Bool()) /* localDone delayed by writePipeLatency */ |
| val inst = Input(UInt(INST_BITS.W)) /* load instruction, forwarded to the helpers */ |
| val baddr = Input(UInt(mp.addrBits.W)) /* forwarded to GenVMECmd */ |
| val vme_rd = new VMEReadMaster |
| val tensor = new TensorClient(tensorType) /* scratchpad read ports and direct write ports */ |
| }) |
| val writePipeLatency = tp.writePipeLatency |
| /* Control state: sBusy from io.start until localDone is raised. */ |
| val sIdle :: sBusy :: Nil = |
| Enum(2) |
| val state = RegInit(sIdle) |
| |
| val isBusy = state === sBusy |
| |
| val localDone = Wire(Bool()) |
| when(io.start) { |
| state := sBusy |
| }.elsewhen(localDone) { |
| state := sIdle |
| } |
| /* NOTE(review): dec is not referenced anywhere else in this class. */ |
| val dec = io.inst.asTypeOf(new MemDecode) |
| /* VME data channel registered once; vmeDataFirePipe is the registered valid AND the registered ready. */ |
| val vmeDataBitsPipe = RegNext(io.vme_rd.data.bits) |
| val vmeDataValidPipe = RegNext(io.vme_rd.data.valid, init = false.B) |
| val vmeDataReadyPipe = RegNext(io.vme_rd.data.ready, init = false.B) |
| val vmeDataFirePipe = vmeDataValidPipe & vmeDataReadyPipe |
| |
| //-------------------------------------- |
| //--- Generate data load VME command --- |
| //-------------------------------------- |
| val vmeCmd = Module (new GenVMECmd(tensorType, debug)) |
| vmeCmd.io.start := io.start |
| vmeCmd.io.isBusy := isBusy |
| vmeCmd.io.inst := io.inst |
| vmeCmd.io.baddr := io.baddr |
| vmeCmd.io.vmeCmd <> io.vme_rd.cmd |
| val readLen = vmeCmd.io.readLen /* block count of the command currently offered */ |
| val commandsDone = vmeCmd.io.done |
| |
| // count how many blocks not received |
| val blkIdxWdth = log2Ceil(tp.tsSizeRatio * tp.memDepth) // the size of scratchpad in blocks |
| // Nb of data blocks requested, not received. TODO: smaller width parameter |
| val blocksInFlight = Reg(UInt(blkIdxWdth.W)) |
| when(io.start) { |
| blocksInFlight := 0.U |
| }.elsewhen(isBusy && io.vme_rd.cmd.fire && !vmeDataFirePipe) { /* command issued, no beat received */ |
| blocksInFlight := blocksInFlight + readLen |
| }.elsewhen(isBusy && io.vme_rd.cmd.fire && vmeDataFirePipe) { /* command issued and one beat received */ |
| blocksInFlight := blocksInFlight + readLen - 1.U |
| }.elsewhen(isBusy && !io.vme_rd.cmd.fire && vmeDataFirePipe) { /* one beat received */ |
| assert(blocksInFlight > 0.U) |
| blocksInFlight := blocksInFlight - 1.U |
| }.otherwise { |
| blocksInFlight := blocksInFlight |
| } |
| |
| //--------------------- |
| //--- Read VME data --- |
| //--------------------- |
| |
| val readData = Module(new ReadVMEData(tensorType, debug)) |
| readData.io.start := io.start |
| readData.io.vmeData.valid := vmeDataValidPipe |
| readData.io.vmeData.bits := vmeDataBitsPipe |
| assert(!readData.io.vmeData.valid || readData.io.vmeData.ready, |
| "-F- Expecting const ready. Fix ReadVMEData to receive data 1 cyce after ready") |
| io.vme_rd.data.ready := readData.io.vmeData.ready |
| val rdDataDestCol = readData.io.col // this is an index of a col in tensor |
| val rdDataDestIdx = readData.io.idx // this is an index of a tensor |
| |
| //------------------------- |
| //--- Fill zero padding --- |
| //------------------------- |
| |
| val fillPadding = Module(new ZeroPadding(tensorType, debug)) |
| fillPadding.io.canWriteMem := !vmeDataFirePipe /* zero fill yields to a received data beat */ |
| fillPadding.io.inst := RegNext(io.inst) // stage it to move from instr queue |
| fillPadding.io.start := RegNext(io.start, init = false.B)// stage it to move from instr queue |
| |
| val isZeroPadWrite = fillPadding.io.tensorIdx.valid // Store zero filled tensor, zpDestIdx is valid |
| val zpDestIdx = fillPadding.io.tensorIdx.bits // Tensor index |
| val paddingDone = fillPadding.io.done |
| |
| //-------------------- |
| //--- Write memory --- |
| //-------------------- |
| /* memSizeRatio (tp.tsSizeRatio) macros form one tensor row; splitDataFactor is the number of |
| * separately accessed groups. Macros are either split further or grouped to match the two. */ |
| val memSizeRatio = tp.tsSizeRatio |
| val splitDataFactor = tp.splitWidth * tp.splitLength |
| val splitMemBlockFactor = if (splitDataFactor > memSizeRatio) { |
| require((splitDataFactor/memSizeRatio) * memSizeRatio == splitDataFactor, |
| "-F- Cannot split tensor data memBlockBits further.") |
| splitDataFactor/memSizeRatio |
| }else { |
| 1 |
| } |
| val groupMemBlockFactor = if (splitDataFactor > memSizeRatio) { |
| 1 |
| }else { |
| require((memSizeRatio/splitDataFactor) * splitDataFactor == memSizeRatio, |
| "-F- Cannot group tensor data memBlockBits into groups.") |
| memSizeRatio/splitDataFactor |
| } |
| // one macro has a VME memory read bit width or read/write group bit width |
| //different groups can read/write scratchpad separately |
| val tensorFile = Seq.fill(memSizeRatio * splitMemBlockFactor |
| ) { |
| SyncReadMem(tp.memDepth, UInt((tp.memBlockBits/splitMemBlockFactor).W)) |
| } |
| |
| |
| require(splitDataFactor * groupMemBlockFactor == memSizeRatio * splitMemBlockFactor, |
| "-F- Wrong split of data") |
| //------------------------------- |
| //--- Write address vector ------ |
| //------------------------------- |
| // split data to build pipe tree |
| val splitFactorL0 = pow(2,log2Ceil(memSizeRatio) / 2).toInt |
| val splitFactorL1 = pow(2,log2Ceil(memSizeRatio) - log2Ceil(memSizeRatio) / 2).toInt |
| require(splitFactorL0 * splitFactorL1 == memSizeRatio) |
| // tensor load instruction writes a VME data block or a whole tensor |
| val waddrTensInstrTmp = Mux(isZeroPadWrite, zpDestIdx, rdDataDestIdx) |
| val waddrTensInstrPipe = VecInit((for (j <- 0 until splitFactorL1) yield { |
| ShiftRegister(waddrTensInstrTmp, if (writePipeLatency > 0) 1 else 0) /* first stage, one copy per L1 branch */ |
| }).flatMap(elem => for (k <- 0 until splitFactorL0) yield { |
| elem |
| }).flatMap(elem => for (k <- 0 until splitMemBlockFactor) yield { |
| ShiftRegister(elem, if (writePipeLatency < 2) 0 else writePipeLatency - 1) /* remaining stages, one copy per macro */ |
| })) |
| require(waddrTensInstrPipe.size == memSizeRatio * splitMemBlockFactor) |
| /* Direct write addresses: each group's io.tensor.wr index repeated groupMemBlockFactor times. */ |
| val waddrDirect = (VecInit((for (grIdx <- 0 until splitDataFactor) yield { |
| io.tensor.wr(grIdx).bits.idx |
| }).flatMap(elem => for (k <- 0 until groupMemBlockFactor) yield {elem}))).asTypeOf( |
| Vec(memSizeRatio * splitMemBlockFactor, io.tensor.wr(0).bits.idx.cloneType) |
| ) |
| |
| /* Per macro: direct address when the state, delayed by writePipeLatency, is idle; else the load address. */ |
| val waddr = Wire(Vec(memSizeRatio * splitMemBlockFactor, waddrTensInstrTmp.cloneType)) |
| for (j <- 0 until memSizeRatio * splitMemBlockFactor) { |
| waddr(j) := Mux( |
| ShiftRegister(state === sIdle, writePipeLatency, resetData = true.B, en = true.B), |
| waddrDirect(j), |
| waddrTensInstrPipe(j)) |
| } |
| |
| //------------------------------- |
| //--- Write enable vector ------- |
| //------------------------------- |
| val dataOffset = rdDataDestCol |
| // get enable signal and duplicate: zero padding enables every column, a data beat only its own column |
| val wenTensInstr = VecInit((for (j <- 0 until memSizeRatio) yield { |
| Mux(isZeroPadWrite, true.B, dataOffset === j.U && vmeDataFirePipe) |
| }).flatMap(elem => for (k <- 0 until splitMemBlockFactor) yield {elem})) |
| |
| val wenDirect = VecInit((for (grIdx <- 0 until splitDataFactor) yield { |
| io.tensor.wr(grIdx).valid |
| }).flatMap(elem => for (k <- 0 until groupMemBlockFactor) yield {elem})) |
| |
| val wen = Wire(Vec(memSizeRatio * splitMemBlockFactor, Bool())) |
| for (j <- 0 until memSizeRatio * splitMemBlockFactor) { |
| wen(j) := Mux( |
| ShiftRegister(state === sIdle, writePipeLatency, resetData = true.B, en = true.B), |
| wenDirect(j), |
| ShiftRegister(wenTensInstr(j), writePipeLatency)) |
| } |
| |
| require(tp.memBlockBits % tp.tensorElemBits == 0) |
| |
| |
| //------------------------------- |
| //--- Write data vector --------- |
| //------------------------------- |
| val wdataTensInstrDataPipe = VecInit((for (j <- 0 until splitFactorL0) yield { |
| ShiftRegister(vmeDataBitsPipe.data, if (writePipeLatency > 0) 1 else 0) |
| }).flatMap(elem => for (k <- 0 until splitFactorL1) yield { |
| elem |
| }).flatMap(elem => for (k <- 0 until splitMemBlockFactor) yield { |
| require(elem.getWidth == tp.memBlockBits) |
| ShiftRegister( |
| elem.asTypeOf(Vec(splitMemBlockFactor, UInt((tp.memBlockBits/splitMemBlockFactor).W)))(k), |
| if (writePipeLatency < 2) 0 else writePipeLatency - 1) |
| })) |
| require(wdataTensInstrDataPipe.size == memSizeRatio * splitMemBlockFactor) |
| val wdataTensInstr = Wire(Vec(memSizeRatio * splitMemBlockFactor, UInt((tp.memBlockBits/splitMemBlockFactor).W))) |
| for (j <- 0 until memSizeRatio * splitMemBlockFactor) { |
| // pipe 1 stage paddingControl per group |
| val padValue = 0.U |
| |
| wdataTensInstr(j) := Mux( |
| ShiftRegister(isZeroPadWrite, writePipeLatency, resetData = false.B, en = true.B), |
| ShiftRegister(padValue /* a single group total data bits */, writePipeLatency), |
| wdataTensInstrDataPipe(j)) |
| } |
| |
| // THIS wdataDirect writes contiguous scratchpad data space |
| // It is WRONG for ACC batch > 1 |
| // maps group data bits to contiguous sequence of mem blocks |
| // but wr(x).bits.data is a window in a tensor |
| val wdataDirect = VecInit((for (grIdx <- 0 until splitDataFactor) yield { |
| io.tensor.wr(grIdx).bits.data |
| }).flatMap(elem => for (k <- 0 until groupMemBlockFactor) yield { |
| elem.asTypeOf(Vec(groupMemBlockFactor, UInt((tp.memBlockBits/splitMemBlockFactor).W)))(k) |
| })) |
| val wdata = Wire(Vec(memSizeRatio * splitMemBlockFactor, UInt((tp.memBlockBits/splitMemBlockFactor).W))) |
| for (j <- 0 until memSizeRatio * splitMemBlockFactor) { |
| wdata(j) := Mux( |
| ShiftRegister(state === sIdle, writePipeLatency, resetData = true.B, en = true.B), |
| wdataDirect(j), |
| wdataTensInstr(j)) |
| } |
| /* Commit the selected address/data to every macro whose enable is set. */ |
| for (j <- 0 until memSizeRatio * splitMemBlockFactor) { |
| when(wen(j)) { |
| tensorFile(j).write(waddr(j), wdata(j)) |
| } |
| } |
| if (debug) { |
| when(isZeroPadWrite) { |
| printf(s"[TensorLoad] $tensorType isZeroPadWrite data zpDestIdx:%d\n", |
| zpDestIdx) |
| } |
| when (vmeDataFirePipe) { |
| printf(s"[TensorLoad] $tensorType data rdDataDestCol:%d rdDataDestIdx:%d\n", |
| rdDataDestCol, |
| rdDataDestIdx) |
| } |
| } |
| |
| // read-from-sram: data.valid is the request valid delayed by tp.readTensorLatency + 1 |
| for (grIdx <- 0 until splitDataFactor) { |
| val rvalid = ShiftRegister( |
| io.tensor.rd(grIdx).idx.valid, tp.readTensorLatency + 1, resetData = false.B, en = true.B) |
| io.tensor.rd(grIdx).data.valid := rvalid |
| } |
| /* Each group reads its memsInGroup macros at the same delayed index; results are packed into data.bits. */ |
| val memsInGroup = memSizeRatio * splitMemBlockFactor / splitDataFactor |
| for (grIdx <- 0 until splitDataFactor) { |
| io.tensor.rd(grIdx).data.bits := |
| VecInit(for (memBlkIdx <- 0 until memsInGroup) yield { |
| tensorFile(grIdx * memsInGroup + memBlkIdx).read( |
| ShiftRegister(io.tensor.rd(grIdx).idx.bits, tp.readTensorLatency), |
| ShiftRegister(io.tensor.rd(grIdx).idx.valid, tp.readTensorLatency, resetData = false.B, en = true.B)) |
| }).asTypeOf(io.tensor.rd(grIdx).data.bits) |
| } |
| |
| // done: busy, all commands generated, no blocks in flight and padding finished |
| val loadDone = blocksInFlight === 0.U && commandsDone && state === sBusy |
| localDone := loadDone && paddingDone |
| io.done := ShiftRegister(localDone, writePipeLatency, resetData = false.B, en = true.B) |
| |
| } |
| |
| //------------------------- |
| //--- Fill zero padding --- |
| //------------------------- |
| |
| //---------------------------------------------------------------------------- |
| // Fill tensors with zeros if padding is defined |
| // stride must be used (xstride and ysize) if xpad_0 or xpad_1 |
| // are not zero and matrix has more than one row of tensors |
| // zp states enumerate different types of padding blocks |
| // TOP - width = dec.xpad_0 + dec.xsize + dec.xpad_1; height = dec.ypad_0 |
| // LEFT - width = dec.xpad_0; height = dec.ysize |
| // RIGHT - width = dec.xpad_1; height = dec.ysize |
| // BOT - width = dec.xpad_0 + dec.xsize + dec.xpad_1; height = dec.ypad_1 |
| // BOTH - LEFT+RIGHT |
| // SKIP - dec.xpad_0 == 0 && dec.xpad_1 == 0 (data rows have no side padding) |
| |
| //Fill algorithm fills row by row from TOP then sides, then BOT |
| //---------------------------------------------------------------------------- |
| /** Zero-padding index generator. |
| * |
| * Walks the padding area described by the decoded instruction (xpad_0, |
| * xpad_1, ypad_0, ypad_1 around an xsize by ysize data area) and emits, |
| * one per cycle in which io.canWriteMem is high, the scratchpad index of |
| * a tensor that has to be filled with zeros. |
| * |
| * io.tensorIdx.valid is high in every fill state (not idle, not skip) |
| * while io.canWriteMem is high; io.tensorIdx.bits is the current row |
| * offset plus the column index. io.done is high whenever the state |
| * machine is idle. |
| * |
| * @param tensorType tensor kind, used to build TensorParams |
| * @param debug not referenced in this class |
| */ class ZeroPadding(tensorType: String = "none", debug: Boolean = false)( |
| implicit p: Parameters) |
| extends Module { |
| val tp = new TensorParams(tensorType) |
| val mp = p(ShellKey).memParams |
| val io = IO(new Bundle { |
| val canWriteMem = Input(Bool()) /* a zero tensor may be emitted this cycle */ |
| val inst = Input(UInt(INST_BITS.W)) |
| val tensorIdx = Output(ValidIO(UInt(tp.memAddrBits.W))) |
| val start = Input(Bool()) /* loads the iterators and picks the first fill state */ |
| val done = Output(Bool()) |
| }) |
| /* Instruction decoded once; pads, sizes and sram_offset drive the whole fill. */ |
| val dec = io.inst.asTypeOf(new MemDecode) |
| |
| val isZeroPadWrite = Wire(Bool()) // Store zero filled tensor, zpDestIdx is valid |
| val zpDestIdx = Wire(dec.sram_offset.cloneType) // Tensor index |
| val sZpIdle :: sZpTop :: sZpSideLeft :: sZpSideRight :: sZpSideBoth :: sZpSideSkip :: sZpBot :: Nil = |
| Enum(7) |
| val zpState = RegInit(sZpIdle) |
| val paddingDone = zpState === sZpIdle // Done filling zero tensors |
| val zpColIdx = Reg(UInt((dec.xpad_0.getWidth + dec.xsize.getWidth + dec.xpad_1.getWidth).W)) |
| val zpNewFillBlock = Wire(Bool()) // separate new fill block <-> inside block row change and column idx calculation |
| // Define padding area iterators |
| val zpRowIdx = Reg(UInt((dec.ypad_0.getWidth + dec.ysize.getWidth + dec.ypad_1.getWidth).W)) // current padding row |
| // current padding column |
| val zpDestRowOffset = Reg(dec.sram_offset.cloneType) // one-dimensional offset for zpRowIdx |
| zpRowIdx := zpRowIdx /* defaults: hold; overridden by the later assignments below */ |
| zpColIdx := zpColIdx |
| zpDestRowOffset := zpDestRowOffset |
| zpNewFillBlock := false.B |
| |
| //state change values: end-of-row / end-of-block detectors |
| val zpLastDataRow = zpRowIdx === dec.ypad_0 + dec.ysize - 1.U |
| val zpTopLastIdx = dec.xpad_0 + dec.xsize + dec.xpad_1 - 1.U // last index of total width |
| val zpWideLineEnd = (zpState === sZpSideBoth || zpState === sZpSideRight) && zpColIdx === zpTopLastIdx |
| val zpNarwLineEnd = zpState === sZpSideLeft && zpColIdx === dec.xpad_0 - 1.U |
| val zpFillLineEnd = zpWideLineEnd || zpNarwLineEnd |
| /* Block-to-block transitions: start -> TOP or a side state -> BOT -> idle. */ |
| when(io.start) { |
| zpRowIdx := 0.U |
| zpDestRowOffset := dec.sram_offset |
| |
| zpColIdx := 0.U |
| when(dec.xpad_0 === 0.U && dec.xpad_1 =/= 0.U && dec.ypad_0 === 0.U) { /* first block is RIGHT: start past the data columns */ |
| zpColIdx := dec.xpad_0 + dec.xsize |
| } |
| when(dec.ypad_0 =/= 0.U) { |
| zpState := sZpTop |
| }.elsewhen(dec.xpad_0 =/= 0.U && dec.xpad_1 === 0.U) { |
| zpState := sZpSideLeft |
| }.elsewhen(dec.xpad_0 === 0.U && dec.xpad_1 =/= 0.U) { |
| zpState := sZpSideRight |
| }.elsewhen(dec.xpad_0 =/= 0.U && dec.xpad_1 =/= 0.U) { |
| zpState := sZpSideBoth |
| }.elsewhen(dec.ypad_1 =/= 0.U) { |
| zpState := sZpSideSkip |
| }.otherwise { |
| zpState := sZpIdle // nothing to fill |
| } |
| }.elsewhen( |
| io.canWriteMem && |
| zpState === sZpTop && |
| zpRowIdx === dec.ypad_0 - 1.U && /*we know ypad_0 > 0 */ |
| zpColIdx === zpTopLastIdx) { /* last tensor of the TOP block */ |
| zpNewFillBlock := true.B |
| |
| zpColIdx := 0.U |
| when(dec.xpad_0 === 0.U && dec.xpad_1 =/= 0.U) { |
| zpColIdx := dec.xpad_0 + dec.xsize |
| } |
| when(dec.xpad_0 =/= 0.U && dec.xpad_1 === 0.U) { |
| zpState := sZpSideLeft |
| }.elsewhen(dec.xpad_0 === 0.U && dec.xpad_1 =/= 0.U) { |
| zpState := sZpSideRight |
| }.elsewhen(dec.xpad_0 =/= 0.U && dec.xpad_1 =/= 0.U) { |
| zpState := sZpSideBoth |
| }.elsewhen(dec.ypad_1 =/= 0.U) { |
| zpState := sZpSideSkip |
| }.otherwise { |
| zpState := sZpIdle // nothing to fill |
| } |
| }.elsewhen( |
| zpLastDataRow && // last row before ypad_1 |
| ((zpFillLineEnd && io.canWriteMem) || // last zero tensor in xpad_0 or xpad_1 |
| zpState === sZpSideSkip)) /* no padding in data rows */ { |
| |
| zpNewFillBlock := true.B |
| |
| when(dec.ypad_1 =/= 0.U) { // also no dec.xpad_1 no xpad_0 |
| zpColIdx := 0.U // first index for ypad_1 area |
| zpState := sZpBot // if more padding is needed go to count data rows |
| }.otherwise { |
| zpState := sZpIdle // nothing to fill |
| } |
| }.elsewhen( |
| io.canWriteMem && |
| zpState === sZpBot && |
| zpRowIdx === dec.ypad_0 + dec.ysize + dec.ypad_1 - 1.U && /*we know ypad_1 > 0 */ |
| zpColIdx === zpTopLastIdx) { /* last tensor of the BOT block */ |
| zpNewFillBlock := true.B |
| |
| zpColIdx := 0.U |
| zpState := sZpIdle |
| }.otherwise { |
| zpState := zpState |
| } |
| // allowed to write memory when data reader is inactive |
| isZeroPadWrite := zpState =/= sZpIdle && zpState =/= sZpSideSkip && io.canWriteMem |
| zpDestIdx := zpDestRowOffset + zpColIdx |
| |
| //increment row |
| // and set zpColIdx on a row change |
| val incrementRow = Wire(Bool()) |
| incrementRow := false.B |
| when( |
| ((((zpState === sZpTop || zpState === sZpSideBoth || zpState === sZpSideRight || zpState === sZpBot) && |
| zpColIdx === zpTopLastIdx) || |
| (zpState === sZpSideLeft && zpColIdx === dec.xpad_0 - 1.U))&& io.canWriteMem) || |
| zpState === sZpSideSkip) { |
| |
| zpDestRowOffset := zpDestRowOffset + zpTopLastIdx + 1.U // count rows in one-dimensional destination matrix |
| zpRowIdx := zpRowIdx + 1.U |
| incrementRow := true.B |
| when(!zpNewFillBlock) { // column may be reset on block type change |
| when(zpState === sZpSideRight) { |
| zpColIdx := dec.xpad_0 + dec.xsize |
| }.otherwise { |
| zpColIdx := 0.U |
| } |
| } |
| } |
| |
| //increment column if it is not done on block change or row in block change |
| when(isZeroPadWrite && !zpNewFillBlock && !incrementRow) { |
| when(zpState === sZpSideBoth && zpColIdx === dec.xpad_0 - 1.U) { |
| zpColIdx := zpColIdx + dec.xsize + 1.U// skip data tensors |
| |
| }.otherwise { |
| zpColIdx := zpColIdx + 1.U |
| } |
| } |
| io.done := zpState === sZpIdle |
| io.tensorIdx.valid := isZeroPadWrite |
| io.tensorIdx.bits := zpDestIdx |
| } |
| |
| //--------------------- |
| //--- Read VME data --- |
| //--------------------- |
| //---------------------------------------------------------------------------- |
| // Read VME data. Generate Memory index and data |
| // transaction TAG is a data block offset in scratchpad |
| // Different transactions are identified by tag change |
| // SAME DESTINATION SUBSEQUENT REQUESTS IN ONE INSTRUCTION LEADS TO UNDEFINED BEHAVIOR |
| //---------------------------------------------------------------------------- |
| /** Decodes the scratchpad destination of each VME read data beat. |
| * |
| * The transaction tag carries the destination block offset: its low |
| * log2Ceil(tp.tsSizeRatio) bits are the column inside a tensor and the |
| * remaining bits are the tensor index. A beat whose tag differs from the |
| * stored one (or the first beat after io.start) begins a new burst and is |
| * decoded from the tag; following beats with the same tag advance the |
| * column by one and, after the last column, the tensor index by one. |
| * io.idx and io.col are driven only in a cycle in which io.vmeData fires; |
| * otherwise they are DontCare. io.vmeData.ready is constantly high. |
| * |
| * @param tensorType tensor kind, used to build TensorParams |
| * @param debug not referenced in this class |
| */ |
| class ReadVMEData(tensorType: String = "none", debug: Boolean = false)( |
|     implicit p: Parameters) |
|     extends Module { |
|   val tp = new TensorParams(tensorType) |
|   val mp = p(ShellKey).memParams |
|   val io = IO(new Bundle { |
|     val start = Input(Bool()) |
|     val vmeData = Flipped(Decoupled(new VMEData)) |
|     val idx = Output(UInt(tp.memAddrBits.W)) |
|     val col = Output(UInt(log2Ceil(tp.tsSizeRatio).W)) |
|   }) |
| |
|   // Tag decoding below uses bit slicing, so both sizes must be powers of two. |
|   require(isPow2(tp.tensorSizeBits), |
|     "-F- Tensor bit size must be 2^. Using shift and bits to divide.") |
|   require(isPow2(tp.memBlockBits), |
|     "-F- Read pulsewidth must be 2^ . Using shift and bits to divide.") |
|   require(tp.tsSizeRatio >= 1, |
|     "-F- Tensor bit size must equal or greater than read puls width.") |
| |
|   val blkOffsetWidth = log2Ceil(tp.tsSizeRatio) |
| |
|   val rdDataDestCol = Wire(UInt(blkOffsetWidth.W)) // this is an index of a col in a tensor |
|   val rdDataDestIdx = Wire(UInt(M_SRAM_OFFSET_BITS.W)) // this is an index of a tensor |
|   io.vmeData.ready := true.B // always ready to read VME data |
| |
|   // decode data destination from the tag: {tensor index, column in tensor} |
|   val vmeTagDecode = io.vmeData.bits.tag |
|   val vmeTagDecodeLast = Reg(vmeTagDecode.cloneType) // store tag to identify a new burst |
|   val rdDataIdx = vmeTagDecode(vmeTagDecode.getWidth - 1, blkOffsetWidth) |
|   val rdDataCol = if (tp.tsSizeRatio == 1) 0.U else vmeTagDecode(blkOffsetWidth - 1, 0) |
|   val rdDataDestColNext = Reg(rdDataDestCol.cloneType) // column expected for the next beat of the burst |
|   val rdDataDestIdxNext = Reg(UInt(M_SRAM_OFFSET_BITS.W)) // tensor index expected for the next beat |
| |
|   // "a tag has been stored since the last start" flag; the registered copy is what the decode uses |
|   val vmeTagDecodeLastValid = Wire(Bool()) |
|   val vmeTagDecodeLastValidNext = RegNext( |
|     next = vmeTagDecodeLastValid, |
|     init = false.B) |
|   when(io.start) { |
|     vmeTagDecodeLastValid := false.B // reset tag valid |
|   }.elsewhen(io.vmeData.fire) { |
|     vmeTagDecodeLastValid := true.B // set tag valid on a new read |
|   }.otherwise { |
|     vmeTagDecodeLastValid := vmeTagDecodeLastValidNext // keep value |
|   } |
| |
|   // A burst starts when no tag is stored yet or the incoming tag differs from the stored one. |
|   val isNewBurst = !vmeTagDecodeLastValidNext || |
|     vmeTagDecode.asUInt =/= vmeTagDecodeLast.asUInt |
| |
|   rdDataDestCol := DontCare |
|   rdDataDestIdx := DontCare |
|   when(io.vmeData.fire) { |
|     when(isNewBurst) { |
|       vmeTagDecodeLast := vmeTagDecode // a new burst |
|       rdDataDestCol := rdDataCol |
|       rdDataDestIdx := rdDataIdx |
|       rdDataDestColNext := rdDataCol + 1.U // increment col in tensor |
|       rdDataDestIdxNext := rdDataIdx |
|     }.otherwise { |
|       rdDataDestCol := rdDataDestColNext // continue burst read |
|       rdDataDestColNext := rdDataDestColNext + 1.U // increment col in tensor |
|       rdDataDestIdx := rdDataDestIdxNext |
|       when(rdDataDestCol === (tp.tsSizeRatio - 1).U) { |
|         rdDataDestIdxNext := rdDataDestIdxNext + 1.U // increment tensor index |
|       } |
|     } |
|   } |
| |
|   io.idx := rdDataDestIdx |
|   io.col := rdDataDestCol |
| } |
| |
| // transaction TAG is a data block offset in scratchpad |
| // Different transactions are identified by tag change |
| // SAME DESTINATION SUBSEQUENT REQUESTS IN ONE INSTRUCTION LEADS TO UNDEFINED BEHAVIOR |
| /** VME read command generator for one load instruction. |
| * |
| * After io.start, issues read commands row by row: dec.ysize source rows, |
| * each of (dec.xsize << log2Ceil(tp.tsSizeRatio)) data blocks. A row is |
| * split into commands of at most (1 << mp.lenBits) blocks, and the first |
| * command of a row is limited to the distance to the next maxTrBytes |
| * boundary. Each command carries the destination scratchpad block index |
| * as its tag. |
| * |
| * io.readLen is the block count of the command currently offered |
| * (io.vmeCmd.bits.len is readLen - 1); io.done is the commandsDone |
| * register, set when the last command of the last row fires. |
| * |
| * @param tensorType tensor kind, used to build TensorParams and in messages |
| * @param debug when true, a printf for every fired command is elaborated |
| */ class GenVMECmd(tensorType: String = "none", debug: Boolean = false)( |
| implicit p: Parameters) |
| extends Module { |
| val tp = new TensorParams(tensorType) |
| val mp = p(ShellKey).memParams |
| val io = IO(new Bundle { |
| val start = Input(Bool()) |
| val isBusy = Input(Bool()) |
| val inst = Input(UInt(INST_BITS.W)) |
| val baddr = Input(UInt(mp.addrBits.W)) |
| val vmeCmd = Decoupled(new VMECmd) |
| val readLen = Output(UInt((mp.lenBits + 1).W)) |
| val done = Output(Bool()) |
| }) |
| val sizeFactor = tp.tsSizeRatio |
| |
| |
| val dec = io.inst.asTypeOf(new MemDecode) |
| |
| val rdCmdExtAddr = Reg(UInt(mp.addrBits.W)) // current address in the row |
| val maxTransfer = (1 << mp.lenBits).U // max number of blocks in transfer |
| // from old data ctrl |
| val elemBytes = tp.tensorLength * tp.tensorWidth * tp.tensorElemBits / 8 // bytes in tensor |
| val maskOffset = VecInit(Seq.fill(M_DRAM_OFFSET_BITS)(true.B)).asUInt /* all-ones, M_DRAM_OFFSET_BITS wide */ |
| val xfer_init_addr = io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(elemBytes))) |
| val maxTrBytes = maxTransfer << (log2Ceil(mp.dataBits) - 3) /* maxTransfer blocks expressed in bytes */ |
| //Align first transfer to maxTrBytes boundary. It occurs on every dec.xsize transfer |
| //all other transfers in the row will end at maxTrBytes boundary |
| val firstMaxTransfer = (maxTrBytes - rdCmdExtAddr % maxTrBytes) >> (log2Ceil(mp.dataBits) - 3) |
| |
| |
| //-------------------------------------- |
| //--- Generate data load VME command --- |
| //-------------------------------------- |
| |
| val rdCmdStartIdxValid = Wire(Bool()) // Command is valid |
| val startIssueCmdRead = Wire(Bool()) // First transaction in dec.xsize transfer |
| val rdCmdStartIdx = Reg(UInt(log2Ceil(tp.memDepth).W)) // Scratchpad data block index for the first transaction |
| val readLen = Wire(UInt((mp.lenBits + 1).W)) // read cmd transaction length. It is <= maxTransfer |
| val commandsDone = RegInit(true.B) // Done generating VME commands |
| val stride = Wire(Bool()) // flags change to the next row to read |
| val blocksReadSize = (dec.xsize << log2Ceil(sizeFactor)) // how many blocks to read in a single src row |
| val blocksReadNb = Reg(blocksReadSize.cloneType) |
| val rdCmdExtAddrRowBegin = Reg(UInt(mp.addrBits.W)) // starting address in the row |
| val newReadRow = Reg(Bool()) // flags the first read of dec.xsize |
| |
| // set which source row of data to read. dec.ysize defines the number of rows |
| val srcRowIdx = Reg(UInt(dec.ysize.getWidth.W)) // current row of stride read |
| when (io.start) { |
| srcRowIdx := 0.U // 1st row |
| }.elsewhen (stride) { |
| srcRowIdx := srcRowIdx + 1.U // increment row |
| }.otherwise { |
| srcRowIdx := srcRowIdx // stay in the row |
| } |
| |
| // set how many blocks of data being loaded |
| commandsDone := commandsDone |
| when (io.start || stride) { |
| blocksReadNb := 0.U |
| commandsDone := false.B |
| }.elsewhen (io.vmeCmd.fire) { |
| val nextBlRNb = blocksReadNb + readLen |
| blocksReadNb := nextBlRNb // THIS IS WHEN A NEW VME CMD HAPPENS |
| when (nextBlRNb === blocksReadSize && srcRowIdx === dec.ysize - 1.U) { /* last command of the last row */ |
| commandsDone := true.B |
| } |
| }.otherwise { |
| blocksReadNb := blocksReadNb |
| } |
| |
| //when the whole xsize row read commands send, go for the next src row |
| when((blocksReadNb === blocksReadSize - readLen) && (srcRowIdx =/= dec.ysize - 1.U) && io.vmeCmd.fire) { |
| stride := true.B |
| }.otherwise { |
| stride := false.B |
| } |
| |
| assert(!io.isBusy || blocksReadSize >= blocksReadNb)// sanity check; below: define how many blocks to read at this cycle |
| val blocksRemained = blocksReadSize - blocksReadNb |
| when (newReadRow) { /* first command of a row: min(remaining, distance to the boundary) */ |
| when(blocksRemained < firstMaxTransfer) { |
| readLen := blocksRemained |
| }.otherwise { |
| readLen := firstMaxTransfer |
| } |
| }.otherwise { /* later commands: min(remaining, maxTransfer) */ |
| when(blocksRemained < maxTransfer) { |
| readLen := blocksRemained |
| }.otherwise { |
| readLen := maxTransfer |
| } |
| } |
| // block index of the read data row (xsize). Modified by zero padding |
| val totalWidth = dec.xsize + dec.xpad_0 + dec.xpad_1 // width of scratchpad matrix in tensors |
| // instead of multiplying total width by ypad_0 do incremental addition. |
| //Should cost ypad_0 cycles to issue 1st read cmd |
| // counts src matrix with y padding rows of tensors |
| val currentRowIdx = Reg(UInt((dec.ysize.getWidth + dec.ypad_0.getWidth).W)) |
| // start to issue read cmd |
| rdCmdStartIdxValid := currentRowIdx >= dec.ypad_0 && |
| currentRowIdx < (dec.ysize + dec.ypad_0) && |
| io.isBusy && |
| !commandsDone |
| when (io.start) { |
| currentRowIdx := 0.U |
| rdCmdStartIdx := dec.sram_offset + dec.xpad_0 // this index is in tensors |
| }.elsewhen (io.isBusy && (currentRowIdx < dec.ypad_0 || stride)) { /* skip a top padding row or move to the next data row */ |
| rdCmdStartIdx := rdCmdStartIdx + totalWidth |
| currentRowIdx := currentRowIdx + 1.U |
| } |
| startIssueCmdRead := false.B |
| when(blocksReadNb === 0.U && rdCmdStartIdxValid) { |
| startIssueCmdRead := true.B |
| } |
| rdCmdExtAddrRowBegin := rdCmdExtAddrRowBegin |
| /* DRAM address bookkeeping: a new row starts dec.xstride tensors after the start of the current row. */ |
| when (io.start) { |
| rdCmdExtAddr := xfer_init_addr |
| rdCmdExtAddrRowBegin := xfer_init_addr |
| newReadRow := true.B |
| }.elsewhen (io.vmeCmd.fire) { |
| when(stride) { |
| val memRow = rdCmdExtAddrRowBegin + (dec.xstride << log2Ceil(elemBytes)) |
| rdCmdExtAddr := memRow // go to the next source matrix row with xstride tensors offset |
| rdCmdExtAddrRowBegin := memRow |
| newReadRow := true.B |
| }.otherwise { |
| newReadRow := false.B |
| // go to the next transaction in the same contiguous data block |
| rdCmdExtAddr := rdCmdExtAddr + (readLen << (log2Ceil(mp.dataBits) - 3)) |
| } |
| }.otherwise { |
| rdCmdExtAddr := rdCmdExtAddr |
| newReadRow := newReadRow |
| } |
| |
| //------------------------------------- |
| //--- execute VME data load command --- |
| //------------------------------------- |
| |
| require(pow(2, log2Ceil(tp.tensorSizeBits)) == tp.tensorSizeBits, |
| "-F- Tensor size must be 2^. Using shift and bits to divide.") |
| require(pow(2, log2Ceil(tp.memBlockBits)) == tp.memBlockBits, |
| "-F- Read pulsewidth must be 2^ . Using shift and bits to divide.") |
| //first log2Ceil(tp.numMemBlock) bits encode block offset in a row, |
| //then log2Ceil(tp.tensorLength) bits for a row in a tensor, then tensor index |
| val blkOffset = log2Ceil(tp.tsSizeRatio) |
| val blkIdxWdth = log2Ceil(tp.tsSizeRatio * tp.memDepth) // the size of scratchpad in blocks |
| |
| val rdCmdDestBlockIdx = Wire(UInt(blkIdxWdth.W)) // dataBits size block index in a scratchpad |
| val rdCmdDestBlockIdxNext = Reg(rdCmdDestBlockIdx.cloneType) // dataBits size block index in a scratchpad |
| rdCmdDestBlockIdxNext := rdCmdDestBlockIdxNext /* defaults: hold the register, drive the wire from it */ |
| rdCmdDestBlockIdx := rdCmdDestBlockIdxNext |
| |
| // block position in a scratchpad |
| val rdCmdValid = Wire(Bool()) |
| //increment scratch pad destination index |
| when(rdCmdStartIdxValid) { |
| rdCmdValid := true.B |
| when(startIssueCmdRead) { |
| rdCmdDestBlockIdx := rdCmdStartIdx << blkOffset // it is aligned by tensor size |
| rdCmdDestBlockIdxNext:= rdCmdDestBlockIdx + readLen |
| }.elsewhen (io.vmeCmd.fire) { |
| // increment block position by transaction length |
| rdCmdDestBlockIdxNext:= rdCmdDestBlockIdxNext + readLen |
| } |
| }.otherwise { |
| rdCmdValid := false.B |
| } |
| if(debug) { |
| when (io.vmeCmd.fire) { |
| printf(s"[GenVMECmd] $tensorType cmd data rdCmdDestBlockIdx:%b " + |
| s" length:%d \n", |
| rdCmdDestBlockIdx, |
| readLen) |
| } |
| } |
| // read-from-dram |
| require(io.vmeCmd.bits.tag.getWidth >= rdCmdDestBlockIdx.getWidth, |
| "-F- Not enough VME tag bits to store transaction tag.") |
| io.vmeCmd.valid := rdCmdValid |
| io.vmeCmd.bits.addr := rdCmdExtAddr |
| io.vmeCmd.bits.len := readLen - 1.U |
| assert(!io.vmeCmd.valid || ((readLen << log2Ceil(mp.dataBits/8)) <= (maxTrBytes - rdCmdExtAddr % maxTrBytes)), |
| s"-F- ${tensorType} DRAM page alignment failure. DRAM " + |
| s"address + len overlaps mp.lenBits*memBlockSize alignment %x %x", |
| rdCmdExtAddr, readLen) |
| io.vmeCmd.bits.tag := rdCmdDestBlockIdx |
| io.readLen := readLen |
| io.done := commandsDone |
| } |