// hardware/xilinx/src/vta.cc - tvm-vta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file vta.cc
  * \brief VTA HLS design.
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 #include "vta.h"

 /*!
  * \brief Zero-fill consecutive rows of a 2D buffer.
  *
  * Clears `range` rows of `mem`, starting at row `sram_idx`. Each row is
  * MAT_AXI_RATIO words wide and is cleared word by word.
  *
  * \param sram_idx Row index to start at. Taken by reference: on return it
  *                 has been advanced by `range`, so the caller can keep
  *                 writing right after the cleared region.
  * \param range    Number of rows to clear.
  * \param mem      Buffer to clear.
  */
 template <typename DATA_T, int MAT_AXI_RATIO>
 void reset_mem(
   memop_sram_T &sram_idx,
   memop_sram_T range,
   DATA_T mem[][MAT_AXI_RATIO]) {

   for (int row = 0; row < range; row++) {
     // Clear every word of the current row.
     for (int word = 0; word < MAT_AXI_RATIO; word++) {
 #pragma HLS UNROLL
       mem[sram_idx][word] = 0;
     }
     // Move the caller-visible cursor to the next row.
     sram_idx++;
   }
 }

 /*!
  * \brief Copy a 2D tile from `src` into `dst`, zero-filling padding around it.
  *
  * The destination is written sequentially starting at row `sram_idx`:
  *   1. `y_offset_0` zeroed rows,
  *   2. for each of the `y_size` source rows: `x_pad_0` zeroed rows,
  *      `x_size` rows copied from `src`, then `x_pad_1` zeroed rows,
  *   3. `y_offset_1` zeroed rows.
  *
  * \param src        Source array, read starting at element dram_idx * MAT_AXI_RATIO.
  * \param dst        Destination buffer, MAT_AXI_RATIO words per row.
  * \param sram_idx   First destination row. Passed by value, so the caller's
  *                   copy is not modified; the local copy is advanced by
  *                   reset_mem() and by the copy loop.
  * \param dram_idx   Source index of the first row, advanced by `x_stride` per row.
  * \param y_size     Number of source rows to copy.
  * \param x_size     Number of destination rows filled per source row.
  * \param x_stride   Source index increment between consecutive source rows.
  * \param x_pad_0    Zeroed destination rows inserted before each copied row.
  * \param x_pad_1    Zeroed destination rows inserted after each copied row.
  * \param y_offset_0 Zeroed destination rows written before the first source row.
  * \param y_offset_1 Zeroed destination rows written after the last source row.
  *
  * NOTE(review): each memcpy moves x_size * ELEM_BYTES bytes while sram_idx
  * advances by x_size, so ELEM_BYTES is presumably the byte size of one
  * destination row -- confirm against vta.h.
  */
 template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
 void load_pad_2d(
   volatile DATA_T *src,
   DATA_T dst[][MAT_AXI_RATIO],
   memop_sram_T sram_idx,
   memop_dram_T dram_idx,
   memop_size_T y_size,
   memop_size_T x_size,
   memop_stride_T x_stride,
   memop_pad_T x_pad_0,
   memop_pad_T x_pad_1,
   memop_sram_T y_offset_0,
   memop_sram_T y_offset_1) {
 #pragma HLS INLINE

   // Leading padding; reset_mem advances sram_idx past the zeroed rows.
   reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_0, dst);
   for (int y = 0; y < y_size; y++) {
 #pragma HLS PIPELINE
     // Padding before the data of this row
     reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_0, dst);
     // Copy the data of this row (the cast drops the volatile qualifier for memcpy)
     memcpy(&dst[sram_idx][0],
            (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
            x_size * ELEM_BYTES);
     sram_idx += x_size;
     dram_idx += x_stride;
     // Padding after the data of this row
     reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_1, dst);
   }
   // Trailing padding
   reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_1, dst);
 }

 /*!
  * \brief Copy a 2D tile from `src` into `dst` without any padding.
  *
  * For each of the `y_size` rows, copies x_size * ELEM_BYTES bytes from
  * `src` (starting at element dram_idx * MAT_AXI_RATIO) to `dst` (starting
  * at row sram_idx), then advances sram_idx by `x_size` and dram_idx by
  * `x_stride`.
  *
  * \param src      Source array.
  * \param dst      Destination buffer, MAT_AXI_RATIO words per row.
  * \param sram_idx First destination row (passed by value; caller's copy is untouched).
  * \param dram_idx Source index of the first row.
  * \param y_size   Number of rows to copy.
  * \param x_size   Number of destination rows filled per source row.
  * \param x_stride Source index increment between consecutive rows.
  */
 template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES>
 void load_2d(
   volatile DATA_T *src,
   DATA_T dst[][MAT_AXI_RATIO],
   memop_sram_T sram_idx,
   memop_dram_T dram_idx,
   memop_size_T y_size,
   memop_size_T x_size,
   memop_stride_T x_stride) {
 #pragma HLS INLINE

   for (int y = 0; y < y_size; y++) {
     // The cast drops the volatile qualifier so the pointer can be handed to memcpy.
     memcpy(&dst[sram_idx][0],
            (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO],
            x_size * ELEM_BYTES);
     // HLS resource directive attached to sram_idx; keep it next to the update below.
 #pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
     sram_idx += x_size;
     dram_idx += x_stride;
   }
 }

 /*!
  * \brief Unpack one tensor from a buffer of wide words into a 2D element array.
  *
  * Row `idx` of `src` holds a Y_DIM x X_DIM tensor of NARROW_W-bit elements
  * packed into WIDE_W-bit words. Elements are stored in row-major order,
  * with the first element of each word in its least significant bits.
  *
  * \param idx Row of `src` to read.
  * \param src Packed buffer; each row holds NARROW_W * Y_DIM * X_DIM / WIDE_W words.
  * \param dst Receives the unpacked Y_DIM x X_DIM tensor.
  */
 template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
 void read_tensor(
   IDX_T idx,
   WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W],
   NARROW_T dst[Y_DIM][X_DIM]) {
 #pragma HLS INLINE

   // Number of narrow elements packed in one wide word
   const int kElemsPerWord = WIDE_W / NARROW_W;
   // Number of wide words needed to hold the whole tensor
   const int kWordsPerTensor = NARROW_W * Y_DIM * X_DIM / WIDE_W;

   for (int word = 0; word < kWordsPerTensor; word++) {
     WIDE_T packet = src[idx][word];
     for (int lane = 0; lane < kElemsPerWord; lane++) {
       // Row-major position of this element within the tensor
       const int flat = word * kElemsPerWord + lane;
       // Least significant bit of this element within the word
       const int lsb = lane * NARROW_W;
       dst[flat / X_DIM][flat % X_DIM] =
           (NARROW_T) packet.range(lsb + NARROW_W - 1, lsb);
     }
   }
 }

 /*!
  * \brief Pack a 2D element array into one row of a buffer of wide words.
  *
  * Inverse of the unpacking layout: the Y_DIM x X_DIM tensor is walked in
  * row-major order and each NARROW_W-bit element is placed into successive
  * bit fields of a WIDE_W-bit word, starting from the least significant bits.
  * Every word is built from zero before being stored.
  *
  * \param idx Row of `dst` to write.
  * \param src Tensor to pack.
  * \param dst Packed buffer; each row holds NARROW_W * Y_DIM * X_DIM / WIDE_W words.
  */
 template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM>
 void write_tensor(
   IDX_T idx,
   NARROW_T src[Y_DIM][X_DIM],
   WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) {
 #pragma HLS INLINE

   // Number of narrow elements packed in one wide word
   const int kElemsPerWord = WIDE_W / NARROW_W;
   // Number of wide words needed to hold the whole tensor
   const int kWordsPerTensor = NARROW_W * Y_DIM * X_DIM / WIDE_W;

   for (int word = 0; word < kWordsPerTensor; word++) {
     WIDE_T packet = 0;
     for (int lane = 0; lane < kElemsPerWord; lane++) {
       // Row-major position of this element within the tensor
       const int flat = word * kElemsPerWord + lane;
       // Least significant bit of this element within the word
       const int lsb = lane * NARROW_W;
       packet.range(lsb + NARROW_W - 1, lsb) = src[flat / X_DIM][flat % X_DIM];
     }
     dst[idx][word] = packet;
   }
 }

 /*!
  * \brief Read instructions from memory and route each one to a stage queue.
  *
  * Routing rules:
  *   - STORE instructions go to `store_queue`;
  *   - LOAD instructions whose memory type is INP or WGT go to `load_queue`;
  *   - everything else (including LOADs of other memory types) goes to
  *     `gemm_queue`.
  *
  * \param insn_count  Number of instructions to read from `insns`.
  * \param insns       Instruction array.
  * \param load_queue  Output queue for the load stage.
  * \param gemm_queue  Output queue for the compute stage.
  * \param store_queue Output queue for the store stage.
  */
 void fetch(
   uint32_t insn_count,
   volatile insn_T *insns,
   hls::stream<insn_T> &load_queue,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<insn_T> &store_queue) {
 PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET)
 #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = gemm_queue
 #pragma HLS INTERFACE axis port = store_queue
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS

   // The counter has the same unsigned type as insn_count: this avoids a
   // signed/unsigned comparison and signed overflow for very large counts.
   INSN_DECODE: for (uint32_t pc = 0; pc < insn_count; pc++) {
 #pragma HLS PIPELINE
     // Read instruction fields
     insn_T raw_insn = insns[pc];
     VTAInsn insn;
     insn.generic = *((VTAGenericInsn *) &raw_insn);
     // Do some partial decoding
     opcode_T opcode = insn.generic.opcode;
     memop_id_T memory_type = insn.mem.memory_type;
     // Only input and weight loads are handled by the load stage
     const bool is_load_stage_insn =
         opcode == VTA_OPCODE_LOAD &&
         (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT);
     // Push to appropriate instruction queue
     if (opcode == VTA_OPCODE_STORE) {
       store_queue.write(raw_insn);
     } else if (is_load_stage_insn) {
       load_queue.write(raw_insn);
     } else {
       gemm_queue.write(raw_insn);
     }
   }
 }

 /*!
  * \brief Load stage: execute one memory instruction targeting the input or
  *        weight buffer.
  *
  * Pops exactly one instruction from `load_queue` per invocation. If the
  * instruction's pop_next_dep flag is set, a token is first consumed from
  * `g2l_dep_queue`; if its push_next_dep flag is set, a token is written to
  * `l2g_dep_queue` after the transfer.
  *
  * Input loads (VTA_MEM_ID_INP) go through load_pad_2d() and honour the
  * padding fields of the instruction. Weight loads (VTA_MEM_ID_WGT) go
  * through load_2d(), which applies no padding. Other memory types perform
  * no transfer.
  *
  * \param inputs        Source array for input loads.
  * \param weights       Source array for weight loads.
  * \param load_queue    Instruction queue for this stage.
  * \param g2l_dep_queue Dependence tokens consumed by this stage.
  * \param l2g_dep_queue Dependence tokens produced by this stage.
  * \param inp_mem       Input buffer written by input loads.
  * \param wgt_mem       Weight buffer written by weight loads.
  */
 void load(
   volatile bus_T *inputs,
   volatile bus_T *weights,
   hls::stream<insn_T> &load_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &l2g_dep_queue,
   bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
   bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) {
 #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
 #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = load_queue
 #pragma HLS INTERFACE axis port = g2l_dep_queue
 #pragma HLS INTERFACE axis port = l2g_dep_queue
 #pragma HLS INTERFACE bram port = wgt_mem
 #pragma HLS INTERFACE bram port = inp_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 #pragma HLS RESOURCE variable = inp_mem core = RAM_1P
 #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P

   // Pop load instruction
   insn_T raw_insn = load_queue.read();
   // Cast to MemInsn
   insn_T raw_copy = raw_insn;
   VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);

   // Pop dependence token if instructed
   if (insn.pop_next_dep) {
     g2l_dep_queue.read();
   }

   // Pre-processing
   // Width of one padded row, and the number of buffer rows taken by the
   // padding before (y_pad_0) and after (y_pad_1) the data.
   memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1);
   memop_sram_T y_offset_0 = x_width * insn.y_pad_0;
 #pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4
   memop_sram_T y_offset_1 = x_width * insn.y_pad_1;
 #pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4

   if (insn.memory_type == VTA_MEM_ID_INP) {
     // Input load: padded 2D copy
     load_pad_2d<bus_T, INP_MAT_AXI_RATIO, VTA_INP_ELEM_BYTES>(
         inputs,
         inp_mem,
         insn.sram_base,
         insn.dram_base,
         insn.y_size,
         insn.x_size,
         insn.x_stride,
         insn.x_pad_0,
         insn.x_pad_1,
         y_offset_0,
         y_offset_1);
   } else if (insn.memory_type == VTA_MEM_ID_WGT) {
     // Weight load: plain 2D copy, the padding fields are ignored
     load_2d<bus_T, WGT_MAT_AXI_RATIO, VTA_WGT_ELEM_BYTES>(
         weights,
         wgt_mem,
         insn.sram_base,
         insn.dram_base,
         insn.y_size,
         insn.x_size,
         insn.x_stride);
   }

   // Push dependence token if instructed
   if (insn.push_next_dep) {
     l2g_dep_queue.write(1);
   }
 }

 /*!
  * \brief Execute one GEMM instruction.
  *
  * Runs a two-level loop nest (iter_out x iter_in) over the micro-ops in
  * [uop_bgn, uop_end). Each micro-op supplies base indices into acc_mem,
  * inp_mem and wgt_mem; the loop offsets, advanced by the instruction's
  * *_factor_in / *_factor_out fields, are added to those bases.
  *
  * For every micro-op, the accumulator tensor at the destination index is
  * updated with the product of the input tensor and the weight tensor. The
  * updated accumulator is written back to acc_mem and its low
  * VTA_OUT_WIDTH bits are written to out_mem at the same index.
  *
  * \param insn_raw Raw instruction, reinterpreted as VTAGemInsn.
  * \param uop_mem  Micro-op buffer.
  * \param acc_mem  Accumulator buffer (read and written).
  * \param inp_mem  Input buffer (read only).
  * \param wgt_mem  Weight buffer (read only).
  * \param out_mem  Output buffer (written).
  */
 void gemm(
   insn_T insn_raw,
   uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
   bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
   bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
   bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
   bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
 #pragma HLS INLINE

   VTAGemInsn insn = *((VTAGemInsn *) &insn_raw);

   // Loop offset
   acc_idx_T dst_offset_out = 0;
   inp_idx_T src_offset_out = 0;
   wgt_idx_T wgt_offset_out = 0;

   // Outer Loop
   EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
     // The inner offsets restart from the outer offsets on every outer iteration
     acc_idx_T dst_offset_in = dst_offset_out;
     inp_idx_T src_offset_in = src_offset_out;
     wgt_idx_T wgt_offset_in = wgt_offset_out;

     // Inner Loop
     EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {

       // Iterate over micro op
       READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
 #pragma HLS PIPELINE II = 1
         // Read micro-op fields
         uop_T uop = uop_mem[upc];

         // Decode indices
         acc_idx_T dst_idx =
             uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in;
         inp_idx_T src_idx =
             uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in;
         wgt_idx_T wgt_idx =
             uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in;

         // Read in weight tensor
         wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN];
         read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor);
         // Read in input tensor
         inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN];
         read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor);
         // Read in accum tensor
         acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT];
         read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor);
         // Output tensor
         out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];

         // Inner GEMM loop
         for (int b = 0; b < VTA_BATCH; b++) {
           for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
             // Initialize the accumulator values
             acc_T accum = a_tensor[b][oc];
             // Dot product sum
             sum_T tmp = 0;
             // Inner matrix multiplication loop (input channel/feature)
             for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
               wgt_T w_elem = w_tensor[oc][ic];
               inp_T i_elem = i_tensor[b][ic];
               mul_T prod_dsp = i_elem * w_elem;
               tmp += (sum_T) prod_dsp;
             }
             // Update summation
             accum += (acc_T) tmp;
             // Write back result acc_mem
             // (when reset_reg is set the accumulator entry is cleared instead)
             a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum;
             // And output vector
             // (always taken from accum, even when reset_reg is set)
             o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
           }
         }

         // Write the results back into accumulator
         write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem);
         // Write the results back in the output buffer
         write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
       }
       // Update offsets
       dst_offset_in += insn.dst_factor_in;
       src_offset_in += insn.src_factor_in;
       wgt_offset_in += insn.wgt_factor_in;
     }
     // Update offsets
     dst_offset_out += insn.dst_factor_out;
     src_offset_out += insn.src_factor_out;
     wgt_offset_out += insn.wgt_factor_out;
   }
 }

 /*!
  * \brief Execute one ALU instruction.
  *
  * Runs a two-level loop nest (iter_out x iter_in) over the micro-ops in
  * [uop_bgn, uop_end). Each micro-op supplies a destination and a source
  * base index, both into acc_mem; the loop offsets, advanced by the
  * instruction's *_factor_in / *_factor_out fields, are added to them.
  *
  * For every element, the first operand is the destination tensor element
  * and the second operand is either the instruction immediate (use_imm) or
  * the source tensor element. Supported opcodes are MIN, MAX, ADD and SHR.
  * The result is written back to acc_mem and its low VTA_OUT_WIDTH bits are
  * written to out_mem, both at the destination index.
  *
  * \param insn_raw Raw instruction, reinterpreted as VTAAluInsn.
  * \param uop_mem  Micro-op buffer.
  * \param acc_mem  Accumulator buffer (read and written).
  * \param inp_mem  Unused; kept so the signature matches gemm().
  * \param wgt_mem  Unused; kept so the signature matches gemm().
  * \param out_mem  Output buffer (written).
  */
 void alu(
   insn_T insn_raw,
   uop_T uop_mem[VTA_UOP_BUFF_DEPTH],
   bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO],
   bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
   bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
   bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
 #pragma HLS INLINE

   VTAAluInsn insn = *((VTAAluInsn *) &insn_raw);

   // Loop offset
   // Both operands are read from acc_mem, so the source offsets use the
   // accumulator index type as well; a narrower index type would make the
   // offset wrap before the end of the accumulator buffer.
   acc_idx_T dst_offset_out = 0;
   acc_idx_T src_offset_out = 0;

   // Outer Loop
   EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) {
     // The inner offsets restart from the outer offsets on every outer iteration
     acc_idx_T dst_offset_in = dst_offset_out;
     acc_idx_T src_offset_in = src_offset_out;

     // Inner Loop
     EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) {
       // Iterate over micro op
       READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) {
 #pragma HLS PIPELINE II = 2
         // Read micro-op fields
         uop_T uop = uop_mem[upc];

         // Decode
         acc_idx_T dst_idx =
             uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in;
         acc_idx_T src_idx =
             uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in;

         // Read in src tensor
         acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT];
         read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor);
         // Read in dst tensor
         acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT];
         read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor);
         // Output tensor
         out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT];

         // Perform ALU op over matrix elements
         for (int i = 0; i < VTA_BATCH; i++) {
           for (int b = 0; b < VTA_BLOCK_OUT; b++) {
             // Read in operands
             acc_T src_0 = dst_tensor[i][b];
             acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b];
             // Shift amount: low bits of the second operand
             aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0);
             if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) {
               // Compute Min/Max
               acc_T mix_val = src_0 < src_1 ?
                   (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
                   (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
               dst_tensor[i][b] = mix_val;
               o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
             } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) {
               // Compute Sum
               acc_T add_val =
                   src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
               dst_tensor[i][b] = add_val;
               o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
             } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) {
               // Compute Shift Right
               acc_T shr_val = src_0 >> shft_by;
               dst_tensor[i][b] = shr_val;
               o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
             }
             // Any other opcode leaves dst_tensor unchanged and does not
             // assign o_tensor for this element.
           }
         }

         // Write the results back into accumulator
         write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem);
         // Write the results back in the output buffer
         write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem);
       }
       // Update offsets
       dst_offset_in += insn.dst_factor_in;
       src_offset_in += insn.src_factor_in;
     }
     // Update offsets
     dst_offset_out += insn.dst_factor_out;
     src_offset_out += insn.src_factor_out;
   }
 }

 /*!
  * \brief Compute stage: execute one instruction from the GEMM queue.
  *
  * Pops exactly one instruction from `gemm_queue` per invocation and acts
  * on its opcode:
  *   - FINISH: sets `done` to 1;
  *   - LOAD:   copies micro-ops into the local micro-op buffer
  *             (VTA_MEM_ID_UOP) or data into the local accumulator buffer
  *             (VTA_MEM_ID_ACC);
  *   - GEMM:   calls gemm();
  *   - ALU:    calls alu().
  * `done` is written with 0 before the opcode is examined, so it is 1 on
  * return only for a FINISH instruction.
  *
  * Dependence tokens: pop_prev_dep consumes from `l2g_dep_queue`,
  * pop_next_dep consumes from `s2g_dep_queue`, push_prev_dep produces into
  * `g2l_dep_queue`, push_next_dep produces into `g2s_dep_queue`.
  *
  * The micro-op and accumulator buffers are function-local statics, so
  * their contents persist across invocations.
  *
  * \param done          Completion flag (see above).
  * \param uops          Source array for micro-op loads.
  * \param biases        Source array for accumulator loads.
  * \param gemm_queue    Instruction queue for this stage.
  * \param l2g_dep_queue Tokens from the load stage.
  * \param s2g_dep_queue Tokens from the store stage.
  * \param g2l_dep_queue Tokens to the load stage.
  * \param g2s_dep_queue Tokens to the store stage.
  * \param inp_mem       Input buffer, forwarded to gemm()/alu().
  * \param wgt_mem       Weight buffer, forwarded to gemm()/alu().
  * \param out_mem       Output buffer, forwarded to gemm()/alu().
  */
 void compute(
   volatile uint32_t &done,
   volatile uop_T *uops,
   volatile bus_T *biases,
   hls::stream<insn_T> &gemm_queue,
   hls::stream<bool> &l2g_dep_queue,
   hls::stream<bool> &s2g_dep_queue,
   hls::stream<bool> &g2l_dep_queue,
   hls::stream<bool> &g2s_dep_queue,
   bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
   bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
   bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
 PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET)
 #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
 #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = gemm_queue
 #pragma HLS INTERFACE axis port = l2g_dep_queue
 #pragma HLS INTERFACE axis port = s2g_dep_queue
 #pragma HLS INTERFACE axis port = g2l_dep_queue
 #pragma HLS INTERFACE axis port = g2s_dep_queue
 #pragma HLS INTERFACE bram port = inp_mem
 #pragma HLS INTERFACE bram port = wgt_mem
 #pragma HLS INTERFACE bram port = out_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 #pragma HLS RESOURCE variable = inp_mem core = RAM_1P
 #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
 #pragma HLS RESOURCE variable = out_mem core = RAM_1P

   // Micro-op storage
   static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];

   // Accumulator storage
   static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO];
 #pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2
 // This is necessary to obtain II=1
 #pragma HLS DEPENDENCE variable = acc_mem inter false

   // Pop GEMM instruction
   insn_T raw_insn = gemm_queue.read();
   // Cast to GenericInsn
   VTAInsn insn;
   insn_T raw_copy = raw_insn;
   insn.generic = *((VTAGenericInsn *) &raw_copy);

   // Pop dependence token if instructed
   if (insn.generic.pop_prev_dep) {
     l2g_dep_queue.read();
   }
   if (insn.generic.pop_next_dep) {
     s2g_dep_queue.read();
   }

   // Set done value
   done = 0;
   // Perform action based on opcode
   if (insn.generic.opcode == VTA_OPCODE_FINISH) {
     // Set done flag if we reach a FINISH instruction
     done = 1;
   } else if (insn.generic.opcode == VTA_OPCODE_LOAD) {
     // Initialize indices
     memop_sram_T sram_idx = insn.mem.sram_base;
     memop_dram_T dram_idx = insn.mem.dram_base;
     if (insn.mem.memory_type == VTA_MEM_ID_UOP) {
       // Perform data transfer
       // (a single contiguous copy of x_size micro-ops; y_size and x_stride are not used)
       memcpy(&uop_mem[sram_idx],
              (const uop_T*) &uops[dram_idx],
              insn.mem.x_size * sizeof(uop_T));
     } else if (insn.mem.memory_type == VTA_MEM_ID_ACC) {
       // Perform data transfer from DRAM
       load_2d<bus_T, ACC_MAT_AXI_RATIO, VTA_ACC_ELEM_BYTES>(
           biases,
           acc_mem,
           sram_idx,
           dram_idx,
           insn.mem.y_size,
           insn.mem.x_size,
           insn.mem.x_stride);
     }
   } else if (insn.generic.opcode == VTA_OPCODE_GEMM) {
     gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
   } else if (insn.generic.opcode == VTA_OPCODE_ALU) {
     alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem);
   }

   // Push dependence token if instructed
   if (insn.generic.push_prev_dep) {
     g2l_dep_queue.write(1);
   }
   if (insn.generic.push_next_dep) {
     g2s_dep_queue.write(1);
   }
 }

 /*!
  * \brief Store stage: execute one store instruction, copying a 2D tile
  *        from the output buffer to `outputs`.
  *
  * Pops exactly one instruction from `store_queue` per invocation. If its
  * pop_prev_dep flag is set, a token is first consumed from
  * `g2s_dep_queue`; if its push_prev_dep flag is set, a token is written
  * to `s2g_dep_queue` after the transfer.
  *
  * For each of the y_size rows, x_size * VTA_OUT_ELEM_BYTES bytes are
  * copied from out_mem (starting at row sram_idx) to `outputs` (starting at
  * element dram_idx * OUT_MAT_AXI_RATIO); sram_idx then advances by x_size
  * and dram_idx by x_stride.
  *
  * \param outputs       Destination array.
  * \param store_queue   Instruction queue for this stage.
  * \param g2s_dep_queue Dependence tokens consumed by this stage.
  * \param s2g_dep_queue Dependence tokens produced by this stage.
  * \param out_mem       Output buffer to read from.
  */
 void store(
   volatile bus_T *outputs,
   hls::stream<insn_T> &store_queue,
   hls::stream<bool> &g2s_dep_queue,
   hls::stream<bool> &s2g_dep_queue,
   bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) {
 #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
 #pragma HLS INTERFACE axis port = store_queue
 #pragma HLS INTERFACE axis port = g2s_dep_queue
 #pragma HLS INTERFACE axis port = s2g_dep_queue
 #pragma HLS INTERFACE bram port = out_mem
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 #pragma HLS RESOURCE variable = out_mem core = RAM_1P

   // Pop store instruction
   insn_T raw_insn = store_queue.read();
   // Cast to MemInsn
   insn_T raw_copy = raw_insn;
   VTAMemInsn insn = *((VTAMemInsn *) &raw_copy);

   // Pop dependence token if instructed
   if (insn.pop_prev_dep) {
     g2s_dep_queue.read();
   }

   // Initialize indices
   memop_sram_T sram_idx = insn.sram_base;
   memop_dram_T dram_idx = insn.dram_base;

   // Copy along y dimension
   for (int y = 0; y < insn.y_size; y++) {
 #pragma HLS PIPELINE
     // Perform data transfer
     // (const_cast drops the volatile qualifier so the pointer can be handed to memcpy)
     memcpy(
       const_cast<bus_T*>(&outputs[dram_idx * OUT_MAT_AXI_RATIO]),
       (const bus_T*) &out_mem[sram_idx][0],
       insn.x_size * VTA_OUT_ELEM_BYTES);
 #pragma HLS RESOURCE variable = sram_idx core = Mul_LUT
     sram_idx += insn.x_size;
     dram_idx += insn.x_stride;
   }

   // Push dependence token if instructed
   if (insn.push_prev_dep) {
     s2g_dep_queue.write(1);
   }
 }

 /*!
  * \brief Top-level wrapper that runs a whole instruction stream by calling
  *        the fetch, load, compute and store stages in sequence.
  *
  * fetch() first sorts all `insn_count` instructions into three temporary
  * queues. The main loop then repeatedly gives each stage (load, compute,
  * store, in that order) the chance to run as many of its instructions as
  * its dependence tokens allow. An instruction whose required token is not
  * yet available stays "popped" in a temporary variable and is retried on
  * the next pass.
  *
  * The loop ends when compute() reports `done`, or after more than 1000
  * passes without completion; in the latter case the function prints which
  * dependence queue each stalled stage is waiting on. Finally all four
  * dependence queues are drained and an assertion checks that none of them
  * still held a token.
  *
  * \param insn_count Number of instructions in `insns`.
  * \param insns      Instruction array.
  * \param uops       Micro-op array.
  * \param inputs     Input data array.
  * \param weights    Weight data array.
  * \param biases     Accumulator (bias) data array.
  * \param outputs    Output data array.
  */
 void vta(
   uint32_t insn_count,
   volatile insn_T *insns,
   volatile uop_T *uops,
   volatile bus_T *inputs,
   volatile bus_T *weights,
   volatile bus_T *biases,
   volatile bus_T *outputs) {
 #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
 #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
 #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
 #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
 #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
 #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
 #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
 #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS

   // Instantiate temporary instruction queues (used for peeking)
   hls::stream<insn_T> tmp_load_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_load_queue)
   hls::stream<insn_T> tmp_gemm_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_gemm_queue)
   hls::stream<insn_T> tmp_store_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_store_queue)

   // Instantiate physical instruction queues
   hls::stream<insn_T> load_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=load_queue)
   hls::stream<insn_T> gemm_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=gemm_queue)
   hls::stream<insn_T> store_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=store_queue)

   // Dependence queues
   // (named <producer>2<consumer>: l = load, g = compute, s = store)
   hls::stream<bool> l2g_dep_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=l2g_dep_queue)
   hls::stream<bool> s2g_dep_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue)
   hls::stream<bool> g2l_dep_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue)
   hls::stream<bool> g2s_dep_queue;
   PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue)

   // Instantiate memories
   bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO];
   bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO];
   bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO];

   // Push all instructions into the queues
   fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue);

   // Global done indicator
   uint32_t done = 0;

   // Temporary instructions
   // (each holds the instruction most recently popped from its temporary queue)
   insn_T tmp_load;
   insn_T tmp_gemv;
   insn_T tmp_store;

   // Peeking status
   // (true while the matching temporary instruction is still waiting to be issued)
   bool tmp_load_popped = false;
   bool tmp_gemm_popped = false;
   bool tmp_store_popped = false;
   // Number of passes through the main control loop
   int exit_counter = 0;

   // Main control loop
   while (true) {
     // First execute as many load instructions as possible
     while (!tmp_load_queue.empty() || tmp_load_popped == true) {
       // Pop the load instruction
       if (!tmp_load_popped) {
         tmp_load_queue.read(tmp_load);
         tmp_load_popped = true;
       }
       // Check dependences and invoke the load stage
       VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load);
       if ((insn.pop_next_dep && !g2l_dep_queue.empty()) ||
           !insn.pop_next_dep) {
         // Push the instruction in the load queue
         load_queue.write(tmp_load);
         tmp_load_popped = false;
         load(inputs, weights, load_queue, g2l_dep_queue, l2g_dep_queue, inp_mem, wgt_mem);
       } else {
         // Execution of load stage pending on completion of other stages, so break here...
         break;
       }
     }
     // Next execute as many gemm instructions as possible
     while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) {
       // Pop the gemm instruction
       if (!tmp_gemm_popped) {
         tmp_gemm_queue.read(tmp_gemv);
         tmp_gemm_popped = true;
       }
       // Check dependences and invoke the compute stage.
       // The instruction can be issued when every token it asks to pop
       // (from load via l2g, from store via s2g) is already available.
       VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
       if (
         (insn.pop_prev_dep && !l2g_dep_queue.empty() &&
          insn.pop_next_dep && !s2g_dep_queue.empty()) ||
         (!insn.pop_prev_dep && insn.pop_next_dep &&
          !s2g_dep_queue.empty()) ||
         (insn.pop_prev_dep && !l2g_dep_queue.empty() &&
         !insn.pop_next_dep) ||
         (!insn.pop_prev_dep && !insn.pop_next_dep)
       ) {
         // Push the instruction in the gemm queue
         gemm_queue.write(tmp_gemv);
         tmp_gemm_popped = false;
         compute(done, uops, biases, gemm_queue, l2g_dep_queue, s2g_dep_queue,
                 g2l_dep_queue, g2s_dep_queue, inp_mem, wgt_mem, out_mem);
       } else {
         // Execution of compute stage pending on completion of other stages,
         // so break here...
         break;
       }
     }
     // Finally execute as many store instructions as possible
     while (!tmp_store_queue.empty() || tmp_store_popped == true) {
       // Pop the store instruction
       if (!tmp_store_popped) {
         tmp_store_queue.read(tmp_store);
         tmp_store_popped = true;
       }
       // Check dependences and invoke the store stage
       VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store);

       if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) ||
           !insn.pop_prev_dep) {
         // Push the instruction in the store queue
         store_queue.write(tmp_store);
         tmp_store_popped = false;
         store(outputs, store_queue, g2s_dep_queue, s2g_dep_queue, out_mem);
       } else {
         // Execution of store stage pending on completion of other stages, so break here...
         break;
       }
     }
     // Check if we get a signal that we are done
     if (done) {
       break;
     }
     // Give up after too many passes without reaching FINISH, and report
     // which dependence queue each stalled stage is waiting on.
     exit_counter++;
     if (exit_counter > 1000) {
       if (tmp_load_popped) {
         if (g2l_dep_queue.empty()) {
           printf("waiting on g2l\n");
         }
       }
       if (tmp_gemm_popped) {
         VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv);
         if (l2g_dep_queue.empty() && insn.pop_prev_dep) {
           printf("waiting on l2g\n");
         }
         if (s2g_dep_queue.empty() && insn.pop_next_dep) {
           printf("waiting on s2g\n");
         }
       }
       if (tmp_store_popped) {
         if (g2s_dep_queue.empty()) {
           printf("waiting on g2s\n");
         }
       }
       break;
     }
   }

   // Ensure that the tokens are empty
   // (drain every dependence queue with non-blocking reads, counting leftovers)
   bool tmp_tok;
   int l2g_count = 0;
   int s2g_count = 0;
   int g2l_count = 0;
   int g2s_count = 0;
   while (l2g_dep_queue.read_nb(tmp_tok)) {
     l2g_count++;
   }
   while (s2g_dep_queue.read_nb(tmp_tok)) {
     s2g_count++;
   }
   while (g2l_dep_queue.read_nb(tmp_tok)) {
     g2l_count++;
   }
   while (g2s_dep_queue.read_nb(tmp_tok)) {
     g2s_count++;
   }

   // Any leftover token means a push was never matched by a pop
   assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0);
 }