[VTA] [Chisel] make dram offset configurable for uop sizes other than 4 bytes (#3654)

diff --git a/hardware/chisel/src/main/scala/core/Core.scala b/hardware/chisel/src/main/scala/core/Core.scala
index 6c29a88..e63a112 100644
--- a/hardware/chisel/src/main/scala/core/Core.scala
+++ b/hardware/chisel/src/main/scala/core/Core.scala
@@ -40,6 +40,9 @@
   outMemDepth: Int = 512,
   instQueueEntries: Int = 32
 )
+{
+  require (uopBits % 8 == 0, s"\n\n[VTA] [CoreParams] uopBits must be byte aligned\n\n")
+}
 
 case object CoreKey extends Field[CoreParams]
 
diff --git a/hardware/chisel/src/main/scala/core/LoadUop.scala b/hardware/chisel/src/main/scala/core/LoadUop.scala
index 5a9c66f..ab8275b 100644
--- a/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ b/hardware/chisel/src/main/scala/core/LoadUop.scala
@@ -69,6 +69,7 @@
   })
   val numUop = 2 // store two uops per sram word
   val uopBits = p(CoreKey).uopBits
+  val uopBytes = uopBits / 8
   val uopDepth = p(CoreKey).uopMemDepth / numUop
 
   val dec = io.inst.asTypeOf(new MemDecode)
@@ -129,7 +130,7 @@
     when (offsetIsEven) {
       raddr := io.baddr + dec.dram_offset
     } .otherwise {
-      raddr := io.baddr + dec.dram_offset - 4.U
+      raddr := io.baddr + dec.dram_offset - uopBytes.U
     }
   } .elsewhen (state === sReadData && xcnt === xlen && xrem =/= 0.U) {
     raddr := raddr + xmax_bytes