// tests/hardware/common/test_lib.cc - tvm-vta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file test_lib.cc
  * \brief Test library for the VTA design simulation and driver tests.
  */

 #include "test_lib.h"

 #ifdef NO_SIM
 #ifdef VTA_TARGET_PYNQ

 /*!
  * \brief Runs an instruction stream on the VTA hardware modules and times the run.
  *
  * Maps the fetch, load, compute and store register spaces, programs the physical
  * address of every non-NULL buffer (obtained through cma_get_phy_addr()), starts
  * the modules and polls the compute module's done register.
  *
  * \param insn_count Number of instructions, written to the fetch module.
  * \param insns Instruction buffer, or NULL to leave its address register untouched.
  * \param uops Micro-op buffer, or NULL to leave its address register untouched.
  * \param inputs Input buffer, or NULL to leave its address register untouched.
  * \param weights Weight buffer, or NULL to leave its address register untouched.
  * \param biases Bias buffer, or NULL to leave its address register untouched.
  * \param outputs Output buffer, or NULL to leave its address register untouched.
  * \return Elapsed time in nanoseconds, measured from just before the registers are
  *         programmed until the done flag is seen or polling gives up.
  */
 uint64_t vta(
   uint32_t insn_count,
   VTAGenericInsn *insns,
   VTAUop *uops,
   uint32_t *inputs,
   uint32_t *weights,
   uint32_t *biases,
   uint32_t *outputs) {
   // Maximum number of done-register polls before the run is reported as timed out
   const int kMaxPolls = 10000000;

   // Get VTA handles
   void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR);
   void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR);
   void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR);
   void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR);

   // Physical address pointers (0 when the corresponding buffer is not supplied)
   uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
   uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
   uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
   uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
   uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
   uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;

 #if VTA_DEBUG == 1
   printf("INFO - Starting FPGA!\n");
 #endif

   // Use the monotonic clock so a wall-clock adjustment cannot distort the interval
   struct timespec start, stop;
   clock_gettime(CLOCK_MONOTONIC, &start);

   VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
   if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy);
   if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy);
   if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy);
   if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy);
   if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy);
   if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy);

   // VTA start
   VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
   VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
   VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
   VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);

   // Busy-poll the compute module until it reports completion or the budget runs out
   int t = 0;
   for (; t < kMaxPolls; ++t) {
     const int flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET);
     if (flag & VTA_DONE) break;
   }

   // Stop the timer before any reporting so console output is not counted
   clock_gettime(CLOCK_MONOTONIC, &stop);

   if (t == kMaxPolls) {
     printf("\tWARNING: VTA TIMEOUT!!!!\n");
   } else {
 #if VTA_DEBUG == 1
     printf("INFO - FPGA Finished!\n");
 #endif
   }

   const uint64_t t_fpga =
       1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);

   // Unmap VTA register
   VTAUnmapRegister(vta_fetch_handle);
   VTAUnmapRegister(vta_load_handle);
   VTAUnmapRegister(vta_compute_handle);
   VTAUnmapRegister(vta_store_handle);

   return t_fpga;
 }

 #endif  // VTA_TARGET_PYNQ
 #endif  // NO_SIM

 // Seed state handed to rand_r() by the random-data helpers in this file.
 uint32_t globalSeed;

 /*!
  * \brief Maps an ALU opcode to a short printable name.
  * \param opcode Value compared against the VTA_ALU_OPCODE_* constants.
  * \param use_imm Selects the " imm" variant of the name for min, max and add.
  * \return A string literal; "unknown op" when no handled opcode matches.
  */
 const char* getOpcodeString(int opcode, bool use_imm) {
   if (opcode == VTA_ALU_OPCODE_MIN) return use_imm ? "min imm" : "min";
   if (opcode == VTA_ALU_OPCODE_MAX) return use_imm ? "max imm" : "max";
   if (opcode == VTA_ALU_OPCODE_ADD) return use_imm ? "add imm" : "add";
   // shr has a single name regardless of use_imm
   if (opcode == VTA_ALU_OPCODE_SHR) return "shr";
   // Disabled:
   // if (opcode == VTA_ALU_OPCODE_MUL) return "mul";
   return "unknown op";
 }

 /*!
  * \brief Packs a 2D array of narrow elements into a flat array of wider words.
  *
  * The source is visited tile by tile (tiles of y_block rows by x_block columns,
  * tiles and the elements inside each tile in row-major order). Each element is
  * masked to SRC_T_WIDTH bits and OR-ed into the current destination word, filling
  * it from the least-significant bits; a word is written out once it holds
  * DST_T_WIDTH / SRC_T_WIDTH elements.
  *
  * \param dst Destination array of packed words.
  * \param src Source rows, indexed as src[y][x].
  * \param y_size Number of source rows; only y_size / y_block whole tiles are packed.
  * \param x_size Number of source columns; only x_size / x_block whole tiles are packed.
  * \param y_block Tile height.
  * \param x_block Tile width.
  */
 template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
 void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) {
   assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH  == 0);
   assert(DST_T_WIDTH <= 64);
   // An element must fit in a destination word, otherwise ratio below would be 0
   assert(SRC_T_WIDTH >= 1 && SRC_T_WIDTH <= DST_T_WIDTH);
   const int ratio = DST_T_WIDTH / SRC_T_WIDTH;
   // Built with a right shift so that a 64-bit element does not need the
   // out-of-range shift "1ULL << 64"; for 64 bits the mask is all ones.
   const long long int mask = static_cast<long long int>(~0ULL >> (64 - SRC_T_WIDTH));
   int buffer_idx = 0;
   DST_T tmp = 0;
   for (int i = 0; i < y_size / y_block; i++) {
     for (int j = 0; j < x_size / x_block; j++) {
       for (int k = 0; k < y_block; k++) {
         for (int l = 0; l < x_block; l++) {
           // Position of this element inside the current destination word
           const int slot = (l + k * x_block) % ratio;
           tmp |= (src[i * y_block + k][j * x_block + l] & mask) << (slot * SRC_T_WIDTH);
           // When tmp is packed, write to destination array
           if (slot == ratio - 1) {
             dst[buffer_idx++] = tmp;
             tmp = 0;
           }
         }
       }
     }
   }
 }

 /*!
  * \brief Unpacks a flat array of wide words into a 2D array of narrow elements.
  *
  * Inverse traversal of packBuffer: the destination is visited tile by tile
  * (y_block rows by x_block columns, row-major), and each element is taken from
  * the current source word by shifting and masking DST_T_WIDTH bits, starting at
  * the least-significant bits. The source index advances after
  * SRC_T_WIDTH / DST_T_WIDTH elements have been read from a word.
  *
  * \param dst Destination rows, indexed as dst[y][x].
  * \param src Source array of packed words.
  * \param y_size Number of destination rows; only y_size / y_block whole tiles are written.
  * \param x_size Number of destination columns; only x_size / x_block whole tiles are written.
  * \param y_block Tile height.
  * \param x_block Tile width.
  */
 template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
 void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) {
   assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0);
   // An element must fit in a source word (otherwise ratio below would be 0)
   // and in the 64-bit mask
   assert(DST_T_WIDTH >= 1 && DST_T_WIDTH <= 64 && DST_T_WIDTH <= SRC_T_WIDTH);
   const int ratio = SRC_T_WIDTH / DST_T_WIDTH;
   // Built with a right shift so that a 64-bit element does not need the
   // out-of-range shift "1ULL << 64"; for 64 bits the mask is all ones.
   const long long int mask = static_cast<long long int>(~0ULL >> (64 - DST_T_WIDTH));
   int buffer_idx = 0;
   for (int i = 0; i < y_size / y_block; i++) {
     for (int j = 0; j < x_size / x_block; j++) {
       for (int k = 0; k < y_block; k++) {
         for (int l = 0; l < x_block; l++) {
           // Position of this element inside the current source word
           const int slot = (l + k * x_block) % ratio;
           dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> (slot * DST_T_WIDTH)) & mask;
           // Last element of the word consumed: move on to the next word
           if (slot == ratio - 1) {
             buffer_idx++;
           }
         }
       }
     }
   }
 }

 /*!
  * \brief Allocates a rows x cols array and fills it with pseudo-random values.
  *
  * The row-pointer table and every row are separate malloc() allocations. Elements
  * are filled in row-major order, each from one rand_r() call on globalSeed.
  *
  * \param rows Number of rows.
  * \param cols Number of elements per row.
  * \return The table of row pointers.
  */
 template <typename T>
 T ** allocInit2dArray(int rows, int cols) {
   T **table = static_cast<T **>(malloc(sizeof(T *) * rows));
   for (int r = 0; r < rows; ++r) {
     // Allocate the row, then fill it before moving on
     T *row = static_cast<T *>(malloc(sizeof(T) * cols));
     for (int c = 0; c < cols; ++c) {
       row[c] = static_cast<T>(rand_r(&globalSeed));
     }
     table[r] = row;
   }
   return table;
 }

 /*!
  * \brief Allocates a rows x cols array with every element set to one value.
  *
  * The row-pointer table and every row are separate malloc() allocations.
  *
  * \param rows Number of rows.
  * \param cols Number of elements per row.
  * \param val Value stored (cast to T) in every element.
  * \return The table of row pointers.
  */
 template <typename T>
 T ** allocSet2dArray(int rows, int cols, int val) {
   T **table = static_cast<T **>(malloc(sizeof(T *) * rows));
   for (int r = 0; r < rows; ++r) {
     // Allocate the row, then fill it before moving on
     T *row = static_cast<T *>(malloc(sizeof(T) * cols));
     for (int c = 0; c < cols; ++c) {
       row[c] = static_cast<T>(val);
     }
     table[r] = row;
   }
   return table;
 }

 /*!
  * \brief Allocates a rows x cols array without initializing its elements.
  *
  * The row-pointer table and every row are separate malloc() allocations.
  *
  * \param rows Number of rows.
  * \param cols Number of elements per row.
  * \return The table of row pointers.
  */
 template <typename T>
 T ** alloc2dArray(int rows, int cols) {
   T **table = static_cast<T **>(malloc(sizeof(T *) * rows));
   int r = 0;
   while (r < rows) {
     table[r] = static_cast<T *>(malloc(sizeof(T) * cols));
     ++r;
   }
   return table;
 }

 /*!
  * \brief Releases a 2D array made of a row-pointer table plus one allocation per row.
  * \param array Table of row pointers.
  * \param rows Number of rows to free.
  * \param cols Unused; kept so the signature mirrors the allocation helpers.
  */
 template <typename T>
 void free2dArray(T **array, int rows, int cols) {
   int r = 0;
   while (r < rows) {
     free(array[r]);
     ++r;
   }
   // Rows are gone; release the pointer table itself
   free(array);
 }

 /*!
  * \brief Allocates a rows x cols x depth array without initializing its elements.
  *
  * Three levels of malloc() allocations are made: one top-level table, one table
  * of pointers per row, and one element array per (row, col) pair.
  *
  * \param rows Size of the first dimension.
  * \param cols Size of the second dimension.
  * \param depth Size of the third dimension.
  * \return The top-level table.
  */
 template <typename T>
 T *** alloc3dArray(int rows, int cols, int depth) {
   T ***cube = static_cast<T ***>(malloc(sizeof(T **) * rows));
   for (int r = 0; r < rows; ++r) {
     T **plane = static_cast<T **>(malloc(sizeof(T *) * cols));
     for (int c = 0; c < cols; ++c) {
       plane[c] = static_cast<T*>(malloc(sizeof(T) * depth));
     }
     cube[r] = plane;
   }
   return cube;
 }

 /*!
  * \brief Releases a 3D array built from three levels of separate allocations.
  * \param array Top-level table.
  * \param rows Size of the first dimension.
  * \param cols Size of the second dimension.
  * \param depth Unused; kept so the signature mirrors alloc3dArray.
  */
 template <typename T>
 void free3dArray(T *** array, int rows, int cols, int depth) {
   for (int r = 0; r < rows; ++r) {
     T **plane = array[r];
     // Innermost element arrays first, then the table that pointed at them
     for (int c = 0; c < cols; ++c) {
       free(plane[c]);
     }
     free(plane);
   }
   free(array);
 }

 /*!
  * \brief Allocates a raw buffer of num_bytes bytes.
  *
  * With NO_SIM defined the buffer comes from VTAMemAlloc(num_bytes, VTA_CACHED);
  * otherwise it comes from malloc().
  *
  * \param num_bytes Requested size in bytes.
  * \return Pointer returned by the underlying allocator.
  */
 void * allocBuffer(size_t num_bytes) {
 #ifndef NO_SIM
   void *buffer = malloc(num_bytes);
 #else
   void *buffer = VTAMemAlloc(num_bytes, VTA_CACHED);
 #endif
   return buffer;
 }

 /*!
  * \brief Releases a buffer obtained from allocBuffer.
  *
  * With NO_SIM defined the buffer is passed to VTAMemFree(); otherwise to free().
  *
  * \param buffer Buffer to release.
  */
 void freeBuffer(void * buffer) {
 #ifndef NO_SIM
   free(buffer);
 #else
   VTAMemFree(buffer);
 #endif
 }

 /*!
  * \brief Builds a 2D load/store instruction and returns it in generic form.
  *
  * All fields of a zero-initialized VTAMemInsn are filled from the arguments; the
  * same padding value is applied on both sides of each axis (y_pad to y_pad_0 and
  * y_pad_1, x_pad to x_pad_0 and x_pad_1). The result is read back through the
  * VTAInsn union as a VTAGenericInsn.
  *
  * \param opcode Instruction opcode.
  * \param type Memory type.
  * \param sram_offset Value for sram_base.
  * \param dram_offset Value for dram_base.
  * \param y_size Value for y_size.
  * \param x_size Value for x_size.
  * \param x_stride Value for x_stride.
  * \param y_pad Padding applied to both y pad fields.
  * \param x_pad Padding applied to both x pad fields.
  * \param pop_prev_dep Pop-previous dependence flag.
  * \param pop_next_dep Pop-next dependence flag.
  * \param push_prev_dep Push-previous dependence flag.
  * \param push_next_dep Push-next dependence flag.
  * \return The instruction viewed as a VTAGenericInsn.
  */
 VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
     int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
     int push_prev_dep, int push_next_dep) {
   VTAMemInsn mem_insn = {};

   // Opcode and dependence flags
   mem_insn.opcode = opcode;
   mem_insn.pop_prev_dep = pop_prev_dep;
   mem_insn.pop_next_dep = pop_next_dep;
   mem_insn.push_prev_dep = push_prev_dep;
   mem_insn.push_next_dep = push_next_dep;

   // Source/destination description
   mem_insn.memory_type = type;
   mem_insn.sram_base = sram_offset;
   mem_insn.dram_base = dram_offset;

   // Transfer geometry
   mem_insn.y_size = y_size;
   mem_insn.x_size = x_size;
   mem_insn.x_stride = x_stride;

   // Symmetric padding on each axis
   mem_insn.y_pad_0 = y_pad;
   mem_insn.y_pad_1 = y_pad;
   mem_insn.x_pad_0 = x_pad;
   mem_insn.x_pad_1 = x_pad;

   // Reinterpret as a generic instruction through the union
   union VTAInsn view;
   view.mem = mem_insn;
   return view.generic;
 }

 /*!
  * \brief Builds a 1D load/store instruction and returns it in generic form.
  *
  * The transfer is encoded as a single row: y_size is 1, x_size and x_stride are
  * both set to size, and all padding fields are 0. The result is read back
  * through the VTAInsn union as a VTAGenericInsn.
  *
  * \param opcode Instruction opcode.
  * \param type Memory type.
  * \param sram_offset Value for sram_base.
  * \param dram_offset Value for dram_base.
  * \param size Number of elements in the single row.
  * \param pop_prev_dep Pop-previous dependence flag.
  * \param pop_next_dep Pop-next dependence flag.
  * \param push_prev_dep Push-previous dependence flag.
  * \param push_next_dep Push-next dependence flag.
  * \return The instruction viewed as a VTAGenericInsn.
  */
 VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
     int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
   VTAMemInsn mem_insn = {};

   // Opcode and dependence flags
   mem_insn.opcode = opcode;
   mem_insn.pop_prev_dep = pop_prev_dep;
   mem_insn.pop_next_dep = pop_next_dep;
   mem_insn.push_prev_dep = push_prev_dep;
   mem_insn.push_next_dep = push_next_dep;

   // Source/destination description
   mem_insn.memory_type = type;
   mem_insn.sram_base = sram_offset;
   mem_insn.dram_base = dram_offset;

   // One row of `size` elements
   mem_insn.y_size = 1;
   mem_insn.x_size = size;
   mem_insn.x_stride = size;

   // No padding on either axis
   mem_insn.y_pad_0 = 0;
   mem_insn.y_pad_1 = 0;
   mem_insn.x_pad_0 = 0;
   mem_insn.x_pad_1 = 0;

   // Reinterpret as a generic instruction through the union
   union VTAInsn view;
   view.mem = mem_insn;
   return view.generic;
 }

 /*!
  * \brief Builds a GEMM instruction and returns it in generic form.
  *
  * Without micro-op compression the instruction covers
  * batch * in_feat * out_feat micro-ops, both loops run once and all index
  * factors are 0. With compression it covers batch micro-ops, the outer loop runs
  * in_feat times and the inner loop out_feat times, with the index factors set
  * as in the code below.
  *
  * \param uop_offset Index of the first micro-op (uop_bgn).
  * \param batch Batch count used to size the micro-op range.
  * \param in_feat Input feature count.
  * \param out_feat Output feature count.
  * \param uop_compression Selects the compressed encoding described above.
  * \param pop_prev_dep Pop-previous dependence flag.
  * \param pop_next_dep Pop-next dependence flag.
  * \param push_prev_dep Push-previous dependence flag.
  * \param push_next_dep Push-next dependence flag.
  * \return The instruction viewed as a VTAGenericInsn.
  */
 VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
     bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
     int push_next_dep) {
   // Converter
   union VTAInsn converter;
   // Value-initialize so that any bits not covered by the assignments below are
   // zero instead of indeterminate (same as the load/store and ALU builders)
   VTAGemInsn insn = {};
   insn.opcode = VTA_OPCODE_GEMM;
   insn.pop_prev_dep = pop_prev_dep;
   insn.pop_next_dep = pop_next_dep;
   insn.push_prev_dep = push_prev_dep;
   insn.push_next_dep = push_next_dep;
   insn.reset_reg = false;
   // Fields that have the same value in both encodings
   insn.uop_bgn = uop_offset;
   insn.dst_factor_out = 0;
   insn.src_factor_in = 0;
   if (!uop_compression) {
     // One micro-op per (batch, in_feat, out_feat) triple; loops are not used
     insn.uop_end = uop_offset + batch * in_feat * out_feat;
     insn.iter_out = 1;
     insn.iter_in = 1;
     insn.src_factor_out = 0;
     insn.wgt_factor_out = 0;
     insn.dst_factor_in = 0;
     insn.wgt_factor_in = 0;
   } else {
     // One micro-op per batch entry; the two loops generate the feature indices
     insn.uop_end = uop_offset + batch;
     insn.iter_out = in_feat;
     insn.iter_in = out_feat;
     insn.src_factor_out = 1;
     insn.wgt_factor_out = 1;
     insn.dst_factor_in = 1;
     insn.wgt_factor_in = in_feat;
   }
   converter.gemm = insn;
   return converter.generic;
 }

 /*!
  * \brief Builds an ALU instruction and returns it in generic form.
  *
  * Without micro-op compression the instruction covers vector_size micro-ops and
  * the inner loop runs once with zero factors. With compression it covers a
  * single micro-op and the inner loop runs vector_size times, stepping both the
  * destination and source index by 1.
  *
  * \param opcode ALU opcode.
  * \param vector_size Number of vector elements to process.
  * \param use_imm Value for the use_imm field.
  * \param imm Immediate value.
  * \param uop_compression Selects the compressed encoding described above.
  * \param pop_prev_dep Pop-previous dependence flag.
  * \param pop_next_dep Pop-next dependence flag.
  * \param push_prev_dep Push-previous dependence flag.
  * \param push_next_dep Push-next dependence flag.
  * \return The instruction viewed as a VTAGenericInsn.
  */
 VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bool uop_compression,
     int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
   // ALU instruction, zero-initialized
   VTAAluInsn alu_insn = {};
   alu_insn.opcode = VTA_OPCODE_ALU;
   alu_insn.pop_prev_dep = pop_prev_dep;
   alu_insn.pop_next_dep = pop_next_dep;
   alu_insn.push_prev_dep = push_prev_dep;
   alu_insn.push_next_dep = push_next_dep;
   alu_insn.reset_reg = false;

   // Fields that do not depend on the encoding
   alu_insn.uop_bgn = 0;
   alu_insn.iter_out = 1;
   alu_insn.dst_factor_out = 0;
   alu_insn.src_factor_out = 0;
   alu_insn.alu_opcode = opcode;
   alu_insn.use_imm = use_imm;
   alu_insn.imm = imm;

   // Encoding-dependent fields: either many micro-ops and a single inner
   // iteration, or one micro-op repeated by the inner loop with unit steps
   const int inner_step = uop_compression ? 1 : 0;
   alu_insn.uop_end = uop_compression ? 1 : vector_size;
   alu_insn.iter_in = uop_compression ? vector_size : 1;
   alu_insn.dst_factor_in = inner_step;
   alu_insn.src_factor_in = inner_step;

   // Reinterpret as a generic instruction through the union
   union VTAInsn view;
   view.alu = alu_insn;
   return view.generic;
 }

 /*!
  * \brief Builds a FINISH instruction and returns it in generic form.
  *
  * The instruction is laid out as a VTAGemInsn with opcode VTA_OPCODE_FINISH; only
  * the two pop flags are taken from the arguments, every other field is 0.
  *
  * \param pop_prev Pop-previous dependence flag.
  * \param pop_next Pop-next dependence flag.
  * \return The instruction viewed as a VTAGenericInsn.
  */
 VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
   // Converter
   union VTAInsn converter;
   // Value-initialize so that any bits not covered by the assignments below are
   // zero instead of indeterminate (same as the load/store and ALU builders)
   VTAGemInsn insn = {};
   insn.opcode = VTA_OPCODE_FINISH;
   insn.pop_prev_dep = pop_prev;
   insn.pop_next_dep = pop_next;
   insn.push_prev_dep = 0;
   insn.push_next_dep = 0;
   insn.reset_reg = false;
   // Micro-op range, loop counts and index factors are all unused
   insn.uop_bgn = 0;
   insn.uop_end = 0;
   insn.iter_out = 0;
   insn.iter_in = 0;
   insn.dst_factor_out = 0;
   insn.src_factor_out = 0;
   insn.wgt_factor_out = 0;
   insn.dst_factor_in = 0;
   insn.src_factor_in = 0;
   insn.wgt_factor_in = 0;
   converter.gemm = insn;
   return converter.generic;
 }

 /*!
  * \brief Allocates and fills the micro-op buffer used for copy operations.
  *
  * Without compression one micro-op is produced per (row, column) pair, with
  * dst_idx set to the row-major position and src_idx/wgt_idx set to 0. With
  * compression a single micro-op is produced with dst_idx 1 and src_idx/wgt_idx 0.
  *
  * \param y_size Number of rows.
  * \param x_size Number of columns.
  * \param uop_compression Non-zero selects the single-micro-op form.
  * \return Buffer from VTAMemAlloc() when NO_SIM is defined, from malloc() otherwise.
  */
 VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
   // Number of micro-ops to emit
   const int uop_count = uop_compression ? 1 : y_size * x_size;

 #ifdef NO_SIM
   VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_count, VTA_CACHED));
 #else
   VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_count));
 #endif

   if (uop_compression) {
     uop_buf[0].dst_idx = 1;
     uop_buf[0].src_idx = 0;
     uop_buf[0].wgt_idx = 0;
     return uop_buf;
   }

   // Walk the buffer with a cursor while enumerating positions in row-major order
   VTAUop *cursor = uop_buf;
   for (int row = 0; row < y_size; ++row) {
     for (int col = 0; col < x_size; ++col) {
       cursor->dst_idx = row * x_size + col;
       cursor->src_idx = 0;
       cursor->wgt_idx = 0;
       ++cursor;
     }
   }
   return uop_buf;
 }

 /*!
  * \brief Allocates and fills the micro-op buffer used for GEMM.
  *
  * Without compression one micro-op is produced per (batch, in_feat, out_feat)
  * triple; with compression one per batch entry. When multi_threaded is set the
  * buffer holds a second set of micro-ops right after the first one, identical
  * except that src_idx is offset by batch * in_feat and wgt_idx by
  * out_feat * in_feat.
  *
  * \param batch Batch count.
  * \param in_feat Input feature count.
  * \param out_feat Output feature count.
  * \param uop_compression Selects the one-micro-op-per-batch form.
  * \param multi_threaded Emits the second, offset set of micro-ops.
  * \return Buffer from VTAMemAlloc() when NO_SIM is defined, from malloc() otherwise.
  */
 VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
     bool multi_threaded) {
   // Micro-ops per set, number of sets, and total
   const int per_set = uop_compression ? batch : batch * in_feat * out_feat;
   const int num_sets = multi_threaded ? 2 : 1;
   const int uop_size = per_set * num_sets;

 #ifdef NO_SIM
   VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
 #else
   VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
 #endif

   for (int set = 0; set < num_sets; ++set) {
     // Index offsets are zero for the first set and one full tensor for the second
     const int src_base = set * batch * in_feat;
     const int wgt_base = set * out_feat * in_feat;
     VTAUop *uop = uop_buf + set * per_set;

     if (uop_compression) {
       for (int b = 0; b < batch; ++b) {
         uop[b].dst_idx = b * out_feat;
         uop[b].src_idx = src_base + b * in_feat;
         uop[b].wgt_idx = wgt_base;
       }
       continue;
     }

     for (int b = 0; b < batch; ++b) {
       for (int in = 0; in < in_feat; ++in) {
         for (int out = 0; out < out_feat; ++out) {
           uop->dst_idx = b * out_feat + out;
           uop->src_idx = src_base + b * in_feat + in;
           uop->wgt_idx = wgt_base + out * in_feat + in;
           ++uop;
         }
       }
     }
   }

   return uop_buf;
 }

 /*!
  * \brief Allocates and fills the micro-op buffer used for element-wise ALU operations.
  *
  * Without compression one micro-op is produced per vector element, with dst_idx
  * set to the element index and src_idx to vector_size plus that index. With
  * compression a single micro-op is produced with dst_idx 0 and src_idx
  * vector_size. wgt_idx is always 0.
  *
  * \param vector_size Number of vector elements.
  * \param uop_compression Selects the single-micro-op form.
  * \return Buffer from VTAMemAlloc() when NO_SIM is defined, from malloc() otherwise.
  */
 VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
   // Derive the total uop size
   int uop_size = (uop_compression) ? 1 : vector_size;

   // Allocate buffer
 #ifdef NO_SIM
   VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
 #else
   VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
 #endif

   // wgt_idx is set explicitly: the buffer is uninitialized memory and
   // printMicroOp reads this field, so it must not be left indeterminate.
   if (!uop_compression) {
     for (int i = 0; i < vector_size; i++) {
       uop_buf[i].dst_idx = i;
       uop_buf[i].src_idx = vector_size + i;
       uop_buf[i].wgt_idx = 0;
     }
   } else {
     uop_buf[0].dst_idx = 0;
     uop_buf[0].src_idx = vector_size;
     uop_buf[0].wgt_idx = 0;
   }

   return uop_buf;
 }

 /*!
  * \brief Prints the instruction/micro-op struct sizes and the VTA configuration
  *        constants to stdout, one per line.
  */
 void printParameters() {
   // sizeof yields size_t, which needs %zu rather than %d
   printf("Size of VTAInsn: %zu\n", sizeof(VTAGenericInsn));
   printf("Size of VTAUop: %zu\n", sizeof(VTAUop));
   // Buffer depths and their log2 counterparts
   printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH);
   printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH);
   printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH);
   printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH);
   printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH);
   printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH);
   printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH);
   printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH);
   // Word counts derived from depth and block sizes
   printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT);
   printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN);
   printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT);
   // Element sizes in bytes
   printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES);
   printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES);
   printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES);
   printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES);
   printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
   // Block sizes
   printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
   printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
 }

 // Prints the four dependence pop/push flags of one instruction on a single line.
 static void printDepFlags(int pop_prev, int pop_next, int push_prev, int push_next) {
   printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
          pop_prev, pop_next, push_prev, push_next);
 }

 /*!
  * \brief Prints a decoded listing of an instruction stream to stdout.
  *
  * Each instruction is decoded through the VTAInsn union according to its opcode
  * (LOAD/STORE, GEMM, FINISH or ALU) and its fields are printed. While walking the
  * stream, four counters are updated from the pop/push dependence flags; their
  * final values are printed after the listing.
  *
  * \param num_insn Number of instructions in the stream.
  * \param insns Instructions to print.
  */
 void printInstruction(int num_insn, VTAGenericInsn *insns) {
   // Running balance of pushes minus pops for each dependence queue
   int l2g_queue = 0;
   int g2l_queue = 0;
   int s2g_queue = 0;
   int g2s_queue = 0;
   // Union used to view each generic instruction through its typed layouts
   union VTAInsn c;

   printf("DEBUG - There are %u instructions\n", num_insn);
   for (int idx = 0; idx < num_insn; ++idx) {
     c.generic = insns[idx];
     printf("DEBUG - INSTRUCTION %u: ", idx);

     const bool is_load = (c.mem.opcode == VTA_OPCODE_LOAD);
     const bool is_store = (c.mem.opcode == VTA_OPCODE_STORE);

     if (is_load || is_store) {
       // Header line: operation and memory type
       if (is_load) {
         printf("LOAD ");
         if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
         if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
         if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
         if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
       }
       if (is_store) {
         printf("STORE ACC\n");
       }
       printDepFlags(static_cast<int>(c.mem.pop_prev_dep),
                     static_cast<int>(c.mem.pop_next_dep),
                     static_cast<int>(c.mem.push_prev_dep),
                     static_cast<int>(c.mem.push_next_dep));
       printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
              static_cast<int>(c.mem.dram_base),
              static_cast<int>(c.mem.sram_base));
       printf("\ty: size=%d, pad=[%d, %d]\n",
              static_cast<int>(c.mem.y_size),
              static_cast<int>(c.mem.y_pad_0),
              static_cast<int>(c.mem.y_pad_1));
       printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
              static_cast<int>(c.mem.x_size),
              static_cast<int>(c.mem.x_stride),
              static_cast<int>(c.mem.x_pad_0),
              static_cast<int>(c.mem.x_pad_1));

       // Which queues are affected depends on the kind of memory operation
       const bool is_inp_or_wgt_load = is_load &&
           (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT);
       if (is_store) {
         if (c.mem.pop_prev_dep) --g2s_queue;
         if (c.mem.push_prev_dep) ++s2g_queue;
       } else if (is_inp_or_wgt_load) {
         if (c.mem.pop_next_dep) --g2l_queue;
         if (c.mem.push_next_dep) ++l2g_queue;
       } else {
         // Remaining loads are accounted for like GEMM/ALU instructions
         if (c.mem.pop_prev_dep) --l2g_queue;
         if (c.mem.push_prev_dep) ++g2l_queue;
         if (c.mem.pop_next_dep) --s2g_queue;
         if (c.mem.push_next_dep) ++g2s_queue;
       }
     } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
       printf("GEMM\n");
       printDepFlags(static_cast<int>(c.mem.pop_prev_dep),
                     static_cast<int>(c.mem.pop_next_dep),
                     static_cast<int>(c.mem.push_prev_dep),
                     static_cast<int>(c.mem.push_next_dep));
       printf("\trange (%d, %d)\n",
              static_cast<int>(c.gemm.uop_bgn),
              static_cast<int>(c.gemm.uop_end));
       printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
       printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
              static_cast<int>(c.gemm.iter_out),
              static_cast<int>(c.gemm.dst_factor_out),
              static_cast<int>(c.gemm.src_factor_out),
              static_cast<int>(c.gemm.wgt_factor_out));
       printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
              static_cast<int>(c.gemm.iter_in),
              static_cast<int>(c.gemm.dst_factor_in),
              static_cast<int>(c.gemm.src_factor_in),
              static_cast<int>(c.gemm.wgt_factor_in));
       if (c.gemm.pop_prev_dep) --l2g_queue;
       if (c.gemm.push_prev_dep) ++g2l_queue;
       if (c.gemm.pop_next_dep) --s2g_queue;
       if (c.gemm.push_next_dep) ++g2s_queue;
     } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
       printf("FINISH\n");
       printDepFlags(static_cast<int>(c.mem.pop_prev_dep),
                     static_cast<int>(c.mem.pop_next_dep),
                     static_cast<int>(c.mem.push_prev_dep),
                     static_cast<int>(c.mem.push_next_dep));
       if (c.gemm.pop_prev_dep) --l2g_queue;
       if (c.gemm.push_prev_dep) ++g2l_queue;
       if (c.gemm.pop_next_dep) --s2g_queue;
       if (c.gemm.push_next_dep) ++g2s_queue;
     } else if (c.mem.opcode == VTA_OPCODE_ALU) {
       printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
       printDepFlags(static_cast<int>(c.mem.pop_prev_dep),
                     static_cast<int>(c.mem.pop_next_dep),
                     static_cast<int>(c.mem.push_prev_dep),
                     static_cast<int>(c.mem.push_next_dep));
       printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
       printf("\trange (%d, %d)\n",
              static_cast<int>(c.alu.uop_bgn),
              static_cast<int>(c.alu.uop_end));
       printf("\touter loop - iter: %d, dst: %d, src: %d\n",
              static_cast<int>(c.alu.iter_out),
              static_cast<int>(c.alu.dst_factor_out),
              static_cast<int>(c.alu.src_factor_out));
       printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
              static_cast<int>(c.alu.iter_in),
              static_cast<int>(c.alu.dst_factor_in),
              static_cast<int>(c.alu.src_factor_in));
       if (c.alu.pop_prev_dep) --l2g_queue;
       if (c.alu.push_prev_dep) ++g2l_queue;
       if (c.alu.pop_next_dep) --s2g_queue;
       if (c.alu.push_next_dep) ++g2s_queue;
     }
   }

   // Final queue balances
   printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
   printf("DEBUG - s2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
 }

 // Helper function: Print micro-ops status
 void printMicroOp(int num_uop, VTAUop *uops) {
   // Iterate over all micro ops
   printf("DEBUG - There are %u micro-ops\n", num_uop);
   for (int i = 0; i < num_uop; i++) {
     // Read micro-op
     printf("DEBUG - UOP %u: ", i);
     printf("acc=%u, inp= %u, wgt=%u\n", uops[i].dst_idx, uops[i].src_idx, uops[i].wgt_idx);
   }
 }

 int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
   // Some assertions
   assert(batch % VTA_BATCH == 0);
   assert(vector_size % VTA_BLOCK_OUT == 0);
   printf("=====================================================================================\n");
   printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
     getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);

   // Instruction count
   int ins_size = 3 * batch / VTA_BATCH + 2;
   // Micro op count
   int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
   // Input/output elements in each transfer
   int tx_size = vector_size / VTA_BLOCK_OUT;
   // Number of input sets to be generated
   int input_sets = (use_imm) ? 1 : 2;
   // Make sure we don't exceed buffer bounds
   assert(uop_size <= VTA_UOP_BUFF_DEPTH);
   assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);

   // Immediate values
   acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
   for (int b = 0; b < batch / VTA_BATCH; b++) {
     if (opcode == VTA_ALU_OPCODE_MIN) {
       immediate[b] = static_cast<acc_T>(
           rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
     } else if (opcode == VTA_ALU_OPCODE_MAX) {
       immediate[b] = static_cast<acc_T>(
           rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
     } else if (opcode == VTA_ALU_OPCODE_ADD) {
       immediate[b] = static_cast<acc_T>(
           rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
     } else if (opcode == VTA_ALU_OPCODE_SHR) {
       immediate[b] = static_cast<acc_T>(
           rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
     }
     // else if (opcode == VTA_ALU_OPCODE_MUL) {
     //   immediate[b] = static_cast<acc_T>(
     //       rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
     // }
   }

   // Initialize instructions
   VTAGenericInsn *insn_buf =
       static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
   int insn_idx = 0;
   insn_buf[insn_idx++] =
       get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
   for (int b = 0; b < batch; b += VTA_BATCH) {
     insn_buf[insn_idx++] = get2DLoadStoreInsn(
         VTA_OPCODE_LOAD,                   // opcode
         VTA_MEM_ID_ACC,                    // vector size
         0,                                 // sram offset
         b / VTA_BATCH * tx_size * input_sets,  // dram offset
         1,                                 // y size
         tx_size * input_sets,              // x size
         tx_size * input_sets,              // x stride
         0,                                 // y pad
         0,                                 // x pad
         0,                                 // pop prev dep
         b > 0,                             // pop next dep
         0,                                 // push prev dep
         0);                                // push next dep
     insn_buf[insn_idx++] = getALUInsn(
         opcode,                            // opcode
         tx_size,                           // vector size
         use_imm,                           // use imm
         immediate[b / VTA_BATCH],          // imm
         uop_compression,                   // uop compression
         0,                                 // pop prev dep
         0,                                 // pop next dep
         0,                                 // push prev dep
         1);                                // push next dep
     insn_buf[insn_idx++] = get2DLoadStoreInsn(
         VTA_OPCODE_STORE,                  // opcode
         VTA_MEM_ID_OUT,                    // vector size
         0,                                 // sram offset
         b / VTA_BATCH * tx_size,           // dram offset
         1,                                 // y size
         tx_size,                           // x size
         tx_size,                           // x stride
         0,                                 // y pad
         0,                                 // x pad
         1,                                 // pop prev dep
         0,                                 // pop next dep
         1,                                 // push prev dep
         0);                                // push next dep
   }
   // Finish
   insn_buf[insn_idx++] = getFinishInsn(0, 1);
   // Prepare the uop buffer
   VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);

 #if VTA_DEBUG == 1
   printInstruction(ins_size, insn_buf);
   printMicroOp(uop_size, uop_buf);
 #endif

   // Initialize the input/output data
   acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < vector_size * input_sets; j++) {
       if (opcode == VTA_ALU_OPCODE_MIN) {
         inputs[i][j] = static_cast<acc_T>(
             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
       } else if (opcode == VTA_ALU_OPCODE_MAX) {
         inputs[i][j] = static_cast<acc_T>(
             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
       } else if (opcode == VTA_ALU_OPCODE_ADD) {
         inputs[i][j] = static_cast<acc_T>(
             rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
       } else if (opcode == VTA_ALU_OPCODE_SHR) {
         inputs[i][j] = static_cast<acc_T>(
             rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
       }
     }
   }

   // Compute reference output
   out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < vector_size; j++) {
       acc_T out_val = 0;
       acc_T imm_val = immediate[i / VTA_BATCH];
       acc_T src_val = inputs[i][j + vector_size];
       if (opcode == VTA_ALU_OPCODE_MIN) {
         if (!use_imm) {
           out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
         } else {
           out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
         }
       } else if (opcode == VTA_ALU_OPCODE_MAX) {
         if (!use_imm) {
           out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
         } else {
           out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
         }
       } else if (opcode == VTA_ALU_OPCODE_ADD) {
         if (!use_imm) {
           out_val = inputs[i][j] + src_val;
         } else {
           out_val = inputs[i][j] + imm_val;
         }
       } else if (opcode == VTA_ALU_OPCODE_SHR) {
         if (!use_imm) {
           if (src_val >= 0) {
             out_val = inputs[i][j] >> src_val;
           } else {
             out_val = inputs[i][j] << (0 - src_val);
           }
         } else {
           if (imm_val >= 0) {
             out_val = inputs[i][j] >> imm_val;
           } else {
             out_val = inputs[i][j] << (0 - imm_val);
           }
         }
       }
       outputs_ref[i][j] = (out_T) out_val;
     }
   }

   // Pack input buffer
   uint32_t *bias_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
   packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(
       bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);

   // Prepare output buffer
   uint32_t *output_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));

 #ifdef NO_SIM
   // Invoke the VTA
   uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
   // Report on timining
   printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
   printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
 #else
   // Invoke the VTA
   vta(ins_size,
       (volatile insn_T *) insn_buf,
       (volatile uop_T *) uop_buf,
       (volatile bus_T *) NULL,
       (volatile bus_T *) NULL,
       (volatile bus_T *) bias_buf,
       (volatile bus_T *) output_buf);
 #endif

   // Unpack output buffer
   out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
   unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
                                                    output_buf,
                                                    batch,
                                                    vector_size,
                                                    VTA_BATCH,
                                                    VTA_BLOCK_OUT);

   // Correctness checks
   int err = 0;
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < vector_size; j++) {
       if (outputs_ref[i][j] != outputs[i][j]) {
         err++;
 #if VTA_DEBUG == 1
         printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
                static_cast<int>(outputs_ref[i][j]),
                static_cast<int>(outputs[i][j]));
 #endif
       }
     }
   }

   // Free all allocated arrays
   free(immediate);
   free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
   free2dArray<out_T>(outputs_ref, batch, vector_size);
   free2dArray<out_T>(outputs, batch, vector_size);
   freeBuffer(insn_buf);
   freeBuffer(uop_buf);
   freeBuffer(bias_buf);
   freeBuffer(output_buf);

   if (err == 0) {
     printf("INFO - ALU test successful!\n");
     return 0;
   } else {
     printf("INFO - ALU test failed, got %d errors!\n", err);
     return -1;
   }
 }

 /*!
  * \brief Blocked GEMM test: runs a (batch x channels) by (channels x channels)
  *  matrix multiply with bias on the VTA, tiled into block x block tiles, and
  *  compares the result element by element against a host-side reference.
  *
  * The instruction stream built here consists of one uop load, then for every
  * (batch block, output channel block) pair a bias load, a
  * {weight load, input load, GEMM} triple per input channel block, and an
  * output store, followed by a finish instruction.
  *
  * \param batch Number of rows of the input/output matrices; must be a multiple of block.
  * \param channels Used as both the input and the output feature count; must be a
  *  multiple of block * virtual_threads.
  * \param block Tile edge length; must be a multiple of VTA_BLOCK_IN, VTA_BLOCK_OUT
  *  and VTA_BATCH.
  * \param uop_compression Forwarded to getGEMMInsn/getGEMMUops; also selects how many
  *  uops are loaded.
  * \param virtual_threads Number of input channel blocks issued per outer step of the
  *  input channel loop; must be at least 1.
  * \return 0 if every output element matches the reference, -1 otherwise.
  */
 int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
     int virtual_threads) {
   // Some assertions
   assert(block % VTA_BLOCK_IN == 0);
   assert(block % VTA_BLOCK_OUT == 0);
   assert(block % VTA_BATCH == 0);
   assert(channels % block == 0);
   assert(batch % block == 0);
   // The input channel loop below advances by block * virtual_threads and always
   // emits virtual_threads sub-blocks per step. Without these two conditions the
   // loop either never terminates or emits more instructions than ins_size
   // accounts for, overrunning insn_buf.
   assert(virtual_threads > 0);
   assert(channels % (block * virtual_threads) == 0);

   printf("=====================================================================================\n");
   printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n",
          batch, channels, block, uop_compression, virtual_threads);

   // Input/output channels
   int in_feat = channels;
   int out_feat = channels;
   // Derive number of elements that need to be loaded/stored.
   // Instruction count: per (batch block, output block) pair there is one bias
   // load and one store (the "2") plus three instructions per input channel
   // block; the trailing "+ 2" covers the initial uop load and the finish.
   int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
   int uop_size = uop_compression ?
       block / VTA_BATCH * virtual_threads :
       block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads;
   int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN;
   int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT;
   int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT;
   // Blocked buffer sizes (in terms of elements)
   int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN;
   int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT;
   int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT;
   // Make sure we don't exceed buffer bounds
   assert(uop_size <= VTA_UOP_BUFF_DEPTH);
   assert(inp_block_size <= VTA_INP_BUFF_DEPTH);
   assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH);
   assert(out_block_size <= VTA_ACC_BUFF_DEPTH);

   // Initialize instruction buffer
   VTAGenericInsn *insn_buf =
       static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
   int insn_idx = 0;

   // Load uops
   insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD,
                                             VTA_MEM_ID_UOP,
                                             0,
                                             0,
                                             uop_size,
                                             0,
                                             0,
                                             0,
                                             0);
   // Iterate over batch blocks
   for (int i = 0; i < batch; i += block) {
     // Iterate over output channel blocks
     for (int j = 0; j < out_feat; j += block) {
       // Load bias block (pop next if not first, push prev)
       insn_buf[insn_idx++] = get2DLoadStoreInsn(
           VTA_OPCODE_LOAD,                                    // opcode
           VTA_MEM_ID_ACC,                                     // type
           0,                                                  // sram offset
           (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
           block / VTA_BATCH,                                  // y size
           block / VTA_BLOCK_OUT,                              // x size
           out_feat / VTA_BLOCK_OUT,                           // x stride
           0,                                                  // y pad
           0,                                                  // x pad
           0,                                                  // pop prev dep
           (i > 0 || j > 0),                                   // pop next dep
           (virtual_threads == 1),                             // push prev dep
           0);                                                 // push next dep
       // Iterate over input channel blocks
       for (int k = 0; k < in_feat; k += block * virtual_threads) {
         for (int l = 0; l < block * virtual_threads; l += block) {
           // Derive dependence flags.
           // pop is the weight load's "pop next dep" flag: always set with a
           // single virtual thread; otherwise set for every block except the
           // very first one and the one at k + l == (virtual_threads - 1) * block.
           bool pop = (virtual_threads == 1) ?
               1 :
               (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block);
           // push_prev is the GEMM's "push prev dep" flag: with a single virtual
           // thread it is cleared only on the last input channel block of a tile.
           bool push_prev = (virtual_threads == 1) ?
               ((k + l) != in_feat - block) :
               ((k + l) != in_feat - virtual_threads * block) &&
               (
                   (k + l != in_feat - block) ||
                   (j != out_feat - block) ||
                   (i != batch - block));
           // push_next is the GEMM's "push next dep" flag: set only on the last
           // input channel block, i.e. right before the store of this tile.
           bool push_next = (k + l == in_feat - block);
           // Load weight block (pop next)
           insn_buf[insn_idx++] = get2DLoadStoreInsn(
               VTA_OPCODE_LOAD,                                // opcode
               VTA_MEM_ID_WGT,                                 // type
               l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT,       // sram offset
               (j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
               block / VTA_BLOCK_OUT,                          // y size
               block / VTA_BLOCK_IN,                           // x size
               in_feat / VTA_BLOCK_IN,                         // x stride
               0,                                              // y pad
               0,                                              // x pad
               0,                                              // pop prev dep
               pop,                                            // pop next dep
               0,                                              // push prev dep
               0);                                             // push next dep
           // Load input block (push next)
           insn_buf[insn_idx++] = get2DLoadStoreInsn(
               VTA_OPCODE_LOAD,                                // opcode
               VTA_MEM_ID_INP,                                 // type
               l / VTA_BLOCK_IN * block / VTA_BATCH,           // sram offset
               (i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
               block / VTA_BATCH,                              // y size
               block / VTA_BLOCK_IN,                           // x size
               in_feat / VTA_BLOCK_IN,                         // x stride
               0,                                              // y pad
               0,                                              // x pad
               0,                                              // pop prev dep
               0,                                              // pop next dep
               0,                                              // push prev dep
               1);                                             // push next dep
           // Perform GEMM (pop prev, push prev if not last, push next if last)
           insn_buf[insn_idx++] = getGEMMInsn(
               l / block * uop_size / virtual_threads,         // uop offset
               block / VTA_BATCH,                              // batch
               block / VTA_BLOCK_IN,                           // in_feat
               block / VTA_BLOCK_OUT,                          // out_feat
               uop_compression,                                // uop_compression
               1,                                              // pop_prev_dep
               0,                                              // pop_next_dep
               push_prev,                                      // push prev dep
               push_next);                                     // push_next_dep
         }
       }
       // Store output block (pop prev, push prev)
       insn_buf[insn_idx++] = get2DLoadStoreInsn(
           VTA_OPCODE_STORE,                                   // opcode
           VTA_MEM_ID_OUT,                                     // type
           0,                                                  // sram offset
           (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
           block / VTA_BATCH,                                  // y size
           block / VTA_BLOCK_OUT,                              // x size
           out_feat / VTA_BLOCK_OUT,                           // x stride
           0,                                                  // y pad
           0,                                                  // x pad
           1,                                                  // pop prev dep
           0,                                                  // pop next dep
           1,                                                  // push prev dep
           0);                                                 // push next dep
     }
   }
   // Finish
   insn_buf[insn_idx++] = getFinishInsn(0, 1);

   // Prepare the uop buffer
   VTAUop * uop_buf = getGEMMUops(
       block / VTA_BATCH,
       block / VTA_BLOCK_IN,
       block / VTA_BLOCK_OUT,
       uop_compression,
       virtual_threads > 1);

 #if VTA_DEBUG == 1
   printInstruction(ins_size, insn_buf);
   printMicroOp(uop_size, uop_buf);
 #endif

   // Initialize inputs
   inp_T **inputs = allocInit2dArray<inp_T>(batch, in_feat);
   // Initialize weights
   wgt_T **weights = allocInit2dArray<wgt_T>(out_feat, in_feat);
   // Initialize biases
   acc_T **biases = allocInit2dArray<acc_T>(batch, out_feat);

   // Reference GEMM implementation: out[i][j] = bias[i][j] + sum_k in[i][k] * wgt[j][k],
   // accumulated in acc_T and then narrowed to out_T.
   out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < out_feat; j++) {
       acc_T sum = biases[i][j];
       for (int k = 0; k < in_feat; k++) {
         sum += (acc_T) (inputs[i][k] * weights[j][k]);
       }
       // Set
       outputs_ref[i][j] = (out_T) sum;
     }
   }

   // Prepare the input buffer
   uint32_t *input_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
   packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
                                                  inputs,
                                                  batch,
                                                  in_feat,
                                                  VTA_BATCH,
                                                  VTA_BLOCK_IN);
   // Prepare the weight buffer
   uint32_t *weight_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
   packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
                                                  weights,
                                                  out_feat,
                                                  in_feat,
                                                  VTA_BLOCK_OUT,
                                                  VTA_BLOCK_IN);
   // Prepare the bias buffer
   uint32_t *bias_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
   packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
                                                  biases,
                                                  batch,
                                                  out_feat,
                                                  VTA_BATCH,
                                                  VTA_BLOCK_OUT);
   // Prepare the output buffer. It holds out_size output elements and is
   // unpacked with VTA_OUT_WIDTH below, so it is sized with the output element
   // byte count rather than the input one.
   uint32_t *output_buf = static_cast<uint32_t *>(
       allocBuffer(VTA_OUT_ELEM_BYTES * out_size));

 #ifdef NO_SIM
   // Invoke the VTA
   uint64_t t_fpga = vta(ins_size,
                         insn_buf,
                         uop_buf,
                         input_buf,
                         weight_buf,
                         bias_buf,
                         output_buf);
   // Report on timing
   printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
   printf("INFO - Throughput: %.3lfGOPs/s\n",
          static_cast<float>(batch) * in_feat * out_feat * 2 / t_fpga);
 #else
   // Invoke the VTA
   vta(ins_size,
       (volatile insn_T *) insn_buf,
       (volatile uop_T *) uop_buf,
       (volatile bus_T *) input_buf,
       (volatile bus_T *) weight_buf,
       (volatile bus_T *) bias_buf,
       (volatile bus_T *) output_buf);
 #endif

   // Unpack output data
   out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
   unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
                                                    output_buf,
                                                    batch,
                                                    out_feat,
                                                    VTA_BATCH,
                                                    VTA_BLOCK_OUT);

   // Correctness checks: count every element that differs from the reference
   int err = 0;
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < out_feat; j++) {
       if (outputs_ref[i][j] != outputs[i][j]) {
         err++;
 #if VTA_DEBUG == 1
         printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
                static_cast<int>(outputs_ref[i][j]),
                static_cast<int>(outputs[i][j]));
 #endif
       }
     }
   }

   // Free all allocated arrays
   free2dArray<inp_T>(inputs, batch, in_feat);
   free2dArray<wgt_T>(weights, out_feat, in_feat);
   free2dArray<acc_T>(biases, batch, out_feat);
   free2dArray<out_T>(outputs_ref, batch, out_feat);
   free2dArray<out_T>(outputs, batch, out_feat);
   freeBuffer(insn_buf);
   freeBuffer(uop_buf);
   freeBuffer(input_buf);
   freeBuffer(weight_buf);
   freeBuffer(bias_buf);
   freeBuffer(output_buf);

   if (err == 0) {
     printf("INFO - Blocked GEMM test successful!\n");
     return 0;
   } else {
     printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
     return -1;
   }
 }


 int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
   // Some assertions
   assert(batch % VTA_BATCH == 0);
   assert(in_channels % VTA_BLOCK_IN == 0);
   assert(out_channels % VTA_BLOCK_OUT == 0);

   printf("=====================================================================================\n");
   printf("INFO - Blocked GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
          batch, in_channels, out_channels, uop_compression);

   // Derive number of elements that need to be loaded/stored
   int ins_size = 7;
   int uop_size = uop_compression ?
       batch / VTA_BATCH :
       batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
   int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
   int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
   int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;
   // Make sure we don't exceed buffer bounds
   assert(uop_size <= VTA_UOP_BUFF_DEPTH);
   assert(inp_size <= VTA_INP_BUFF_DEPTH);
   assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
   assert(out_size <= VTA_ACC_BUFF_DEPTH);

   // Initialize instruction buffer
   VTAGenericInsn *insn_buf =
       static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
   int insn_idx = 0;

   // Load uops
   insn_buf[insn_idx++] = get1DLoadStoreInsn(
       VTA_OPCODE_LOAD,
       VTA_MEM_ID_UOP,
       0,
       0,
       uop_size,
       0,
       0,
       0,
       0);
   // Load bias
   insn_buf[insn_idx++] = get1DLoadStoreInsn(
       VTA_OPCODE_LOAD,                                    // opcode
       VTA_MEM_ID_ACC,                                     // type
       0,                                                  // sram offset
       0,                                                  // dram offset
       out_size,                                           // size
       0,                                                  // pop prev dep
       0,                                                  // pop next dep
       1,                                                  // push prev dep
       0);                                                 // push next dep
   // Load weight block (pop next)
   insn_buf[insn_idx++] = get1DLoadStoreInsn(
       VTA_OPCODE_LOAD,                                    // opcode
       VTA_MEM_ID_WGT,                                     // type
       0,                                                  // sram offset
       0,                                                  // dram offset
       wgt_size,                                           // size
       0,                                                  // pop prev dep
       1,                                                  // pop next dep
       0,                                                  // push prev dep
       0);                                                 // push next dep
   // Load input block (push next)
   insn_buf[insn_idx++] = get1DLoadStoreInsn(
       VTA_OPCODE_LOAD,                                    // opcode
       VTA_MEM_ID_INP,                                     // type
       0,                                                  // sram offset
       0,                                                  // dram offset
       inp_size,                                           // size
       0,                                                  // pop prev dep
       0,                                                  // pop next dep
       0,                                                  // push prev dep
       1);                                                 // push next dep
   // Perform GEMM (pop prev, push prev if not last, push next if last)
   insn_buf[insn_idx++] = getGEMMInsn(
       0,                                                  // uop offset
       batch / VTA_BATCH,                                  // batch
       in_channels / VTA_BLOCK_IN,                         // in_channels
       out_channels / VTA_BLOCK_OUT,                       // out_channels
       uop_compression,                                    // uop_compression
       1,                                                  // pop_prev_dep
       0,                                                  // pop_next_dep
       0,                                                  // push prev dep
       1);                                                 // push_next_dep
   // Store output block (pop prev, push prev if not last)
   insn_buf[insn_idx++] = get1DLoadStoreInsn(
       VTA_OPCODE_STORE,                                   // opcode
       VTA_MEM_ID_OUT,                                     // type
       0,                                                  // sram offset
       0,                                                  // dram offset
       out_size,                                           // size
       1,                                                  // pop prev dep
       0,                                                  // pop next dep
       1,                                                  // push prev dep
       0);                                                 // push next dep
   // Finish
   insn_buf[insn_idx++] = getFinishInsn(0, 1);

   // Prepare the uop buffer
   VTAUop * uop_buf = getGEMMUops(
       batch / VTA_BATCH,
       in_channels / VTA_BLOCK_IN,
       out_channels / VTA_BLOCK_OUT,
       uop_compression,
       0);

 #if VTA_DEBUG == 1
   printInstruction(ins_size, insn_buf);
   printMicroOp(uop_size, uop_buf);
 #endif

   // Initialize inputs
   inp_T **inputs = allocInit2dArray<inp_T>(batch, in_channels);
   // Initialize weights
   wgt_T **weights = allocInit2dArray<wgt_T>(out_channels, in_channels);
   // Initialize biases
   acc_T **biases = allocInit2dArray<acc_T>(batch, out_channels);

   // Reference GEMM implementation
   out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < out_channels; j++) {
       acc_T sum = biases[i][j];
       for (int k = 0; k < in_channels; k++) {
         sum += (acc_T) (inputs[i][k] * weights[j][k]);
       }
       // Set
       outputs_ref[i][j] = (out_T) sum;
     }
   }

   // Prepare the input buffer
   uint32_t *input_buf = static_cast<uint32_t *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
   packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
                                                  inputs,
                                                  batch,
                                                  in_channels,
                                                  VTA_BATCH,
                                                  VTA_BLOCK_IN);
   // Prepare the weight buffer
   uint32_t *weight_buf = static_cast<uint32_t *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
   packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
                                                  weights,
                                                  out_channels,
                                                  in_channels,
                                                  VTA_BLOCK_OUT,
                                                  VTA_BLOCK_IN);
   // Prepare the bias buffer
   uint32_t *bias_buf = static_cast<uint32_t *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
   packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
                                                  biases,
                                                  batch,
                                                  out_channels,
                                                  VTA_BATCH,
                                                  VTA_BLOCK_OUT);
   // Prepare the output buffer
   uint32_t *output_buf = static_cast<uint32_t *>(allocBuffer(VTA_OUT_ELEM_BYTES * out_size));

 #ifdef NO_SIM
   // Invoke the VTA
   uint64_t t_fpga = vta(ins_size,
                         insn_buf,
                         uop_buf,
                         input_buf,
                         weight_buf,
                         bias_buf,
                         output_buf);
   // Report on timining
   printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
   printf("INFO - Throughput: %.3lfGOPs/s\n",
          static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
 #else
   // Invoke the VTA
   vta(ins_size,
       (volatile insn_T *) insn_buf,
       (volatile uop_T *) uop_buf,
       (volatile bus_T *) input_buf,
       (volatile bus_T *) weight_buf,
       (volatile bus_T *) bias_buf,
       (volatile bus_T *) output_buf);
 #endif

   // Unpack output data
   out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
   unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
                                                    output_buf,
                                                    batch,
                                                    out_channels,
                                                    VTA_BATCH,
                                                    VTA_BLOCK_OUT);

   // Correctness checks
   int err = 0;
   for (int i = 0; i < batch; i++) {
     for (int j = 0; j < out_channels; j++) {
       if (outputs_ref[i][j] != outputs[i][j]) {
         err++;
 #if VTA_DEBUG == 1
         printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
                static_cast<int>(outputs_ref[i][j]),
                static_cast<int>(outputs[i][j]));
 #endif
       }
     }
   }

   // Free all allocated arrays
   free2dArray<inp_T>(inputs, batch, in_channels);
   free2dArray<wgt_T>(weights, out_channels, in_channels);
   free2dArray<acc_T>(biases, batch, out_channels);
   free2dArray<out_T>(outputs_ref, batch, out_channels);
   free2dArray<out_T>(outputs, batch, out_channels);
   freeBuffer(insn_buf);
   freeBuffer(uop_buf);
   freeBuffer(input_buf);
   freeBuffer(weight_buf);
   freeBuffer(bias_buf);
   freeBuffer(output_buf);

   if (err == 0) {
     printf("INFO - Blocked GEMM test successful!\n");
     return 0;
   } else {
     printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
     return -1;
   }
 }