| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| /*! |
| * \file vta.cpp |
| * \brief VTA HLS design. |
| */ |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "vta.h" |
| |
| template <typename DATA_T, int MAT_AXI_RATIO> |
| void reset_mem( |
| memop_sram_T &sram_idx, |
| memop_sram_T range, |
| DATA_T mem[][MAT_AXI_RATIO]) { |
| |
| for (int i = 0; i < range; i ++) { |
| for (int j = 0; j < MAT_AXI_RATIO; j ++) { |
| #pragma HLS UNROLL |
| mem[sram_idx][j] = 0; |
| } |
| sram_idx ++; |
| } |
| } |
| |
| template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES> |
| void load_pad_2d( |
| volatile DATA_T *src, |
| DATA_T dst[][MAT_AXI_RATIO], |
| memop_sram_T sram_idx, |
| memop_dram_T dram_idx, |
| memop_size_T y_size, |
| memop_size_T x_size, |
| memop_stride_T x_stride, |
| memop_pad_T x_pad_0, |
| memop_pad_T x_pad_1, |
| memop_sram_T y_offset_0, |
| memop_sram_T y_offset_1) { |
| #pragma HLS INLINE |
| |
| reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_0, dst); |
| for (int y = 0; y < y_size; y++) { |
| #pragma HLS PIPELINE |
| reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_0, dst); |
| memcpy(&dst[sram_idx][0], |
| (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO], |
| x_size * ELEM_BYTES); |
| sram_idx += x_size; |
| dram_idx += x_stride; |
| reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, x_pad_1, dst); |
| } |
| reset_mem<DATA_T, MAT_AXI_RATIO>(sram_idx, y_offset_1, dst); |
| } |
| |
| template <typename DATA_T, int MAT_AXI_RATIO, int ELEM_BYTES> |
| void load_2d( |
| volatile DATA_T *src, |
| DATA_T dst[][MAT_AXI_RATIO], |
| memop_sram_T sram_idx, |
| memop_dram_T dram_idx, |
| memop_size_T y_size, |
| memop_size_T x_size, |
| memop_stride_T x_stride) { |
| #pragma HLS INLINE |
| |
| for (int y = 0; y < y_size; y++) { |
| memcpy(&dst[sram_idx][0], |
| (const DATA_T*) &src[dram_idx * MAT_AXI_RATIO], |
| x_size * ELEM_BYTES); |
| #pragma HLS RESOURCE variable = sram_idx core = Mul_LUT |
| sram_idx += x_size; |
| dram_idx += x_stride; |
| } |
| } |
| |
| template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM> |
| void read_tensor( |
| IDX_T idx, |
| WIDE_T src[][NARROW_W * Y_DIM * X_DIM / WIDE_W], |
| NARROW_T dst[Y_DIM][X_DIM]) { |
| #pragma HLS INLINE |
| |
| // Read in weight tensor |
| for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) { |
| WIDE_T packet = src[idx][p]; |
| for (int w = 0; w < (WIDE_W / NARROW_W); w++) { |
| int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM; |
| int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM; |
| dst[x][y] = (NARROW_T) packet.range((w + 1) * NARROW_W - 1, w * NARROW_W); |
| } |
| } |
| } |
| |
| template <typename WIDE_T, typename NARROW_T, typename IDX_T, int WIDE_W, int NARROW_W, int Y_DIM, int X_DIM> |
| void write_tensor( |
| IDX_T idx, |
| NARROW_T src[Y_DIM][X_DIM], |
| WIDE_T dst[][NARROW_W * Y_DIM * X_DIM / WIDE_W]) { |
| #pragma HLS INLINE |
| |
| for (int p = 0; p < NARROW_W * Y_DIM * X_DIM / WIDE_W; p++) { |
| WIDE_T packet = 0; |
| for (int w = 0; w < (WIDE_W / NARROW_W); w++) { |
| int x = (p * (WIDE_W / NARROW_W) + w) / X_DIM; |
| int y = (p * (WIDE_W / NARROW_W) + w) % X_DIM; |
| packet.range((w + 1) * NARROW_W - 1, w * NARROW_W) = src[x][y]; |
| } |
| dst[idx][p] = packet; |
| } |
| } |
| |
| void fetch( |
| uint32_t insn_count, |
| volatile insn_T *insns, |
| hls::stream<insn_T> &load_queue, |
| hls::stream<insn_T> &gemm_queue, |
| hls::stream<insn_T> &store_queue) { |
| PRAGMA_HLS(HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS offset = VTA_FETCH_INSN_COUNT_OFFSET) |
| #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port |
| #pragma HLS INTERFACE axis port = load_queue |
| #pragma HLS INTERFACE axis port = gemm_queue |
| #pragma HLS INTERFACE axis port = store_queue |
| #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS |
| |
| INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) { |
| #pragma HLS PIPELINE |
| // Read instruction fields |
| insn_T raw_insn = insns[pc]; |
| VTAInsn insn; |
| insn.generic = *((VTAGenericInsn *) &raw_insn); |
| // Do some partial decoding |
| opcode_T opcode = insn.generic.opcode; |
| memop_id_T memory_type = insn.mem.memory_type; |
| // Push to appropriate instruction queue |
| if (opcode == VTA_OPCODE_STORE) { |
| store_queue.write(raw_insn); |
| } else if (opcode == VTA_OPCODE_LOAD) { |
| if (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT) { |
| load_queue.write(raw_insn); |
| } else { |
| gemm_queue.write(raw_insn); |
| } |
| } else { |
| gemm_queue.write(raw_insn); |
| } |
| } |
| } |
| |
| void load( |
| volatile bus_T *inputs, |
| volatile bus_T *weights, |
| hls::stream<insn_T> &load_queue, |
| hls::stream<bool> &g2l_dep_queue, |
| hls::stream<bool> &l2g_dep_queue, |
| bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], |
| bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]) { |
| #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port |
| #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port |
| #pragma HLS INTERFACE axis port = load_queue |
| #pragma HLS INTERFACE axis port = g2l_dep_queue |
| #pragma HLS INTERFACE axis port = l2g_dep_queue |
| #pragma HLS INTERFACE bram port = wgt_mem |
| #pragma HLS INTERFACE bram port = inp_mem |
| #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS |
| #pragma HLS RESOURCE variable = inp_mem core = RAM_1P |
| #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P |
| |
| // Pop load instruction |
| insn_T raw_insn = load_queue.read(); |
| // Cast to MemInsn |
| insn_T raw_copy = raw_insn; |
| VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); |
| |
| // Pop dependence token if instructed |
| if (insn.pop_next_dep) { |
| g2l_dep_queue.read(); |
| } |
| |
| // Pre-processing |
| memop_sram_T x_width = (insn.x_pad_0 + insn.x_size + insn.x_pad_1); |
| memop_sram_T y_offset_0 = x_width * insn.y_pad_0; |
| #pragma HLS RESOURCE variable = y_offset_0 core = Mul_LUT latency = 4 |
| memop_sram_T y_offset_1 = x_width * insn.y_pad_1; |
| #pragma HLS RESOURCE variable = y_offset_1 core = Mul_LUT latency = 4 |
| |
| if (insn.memory_type == VTA_MEM_ID_INP) { |
| load_pad_2d<bus_T, INP_MAT_AXI_RATIO, VTA_INP_ELEM_BYTES>( |
| inputs, |
| inp_mem, |
| insn.sram_base, |
| insn.dram_base, |
| insn.y_size, |
| insn.x_size, |
| insn.x_stride, |
| insn.x_pad_0, |
| insn.x_pad_1, |
| y_offset_0, |
| y_offset_1); |
| } else if (insn.memory_type == VTA_MEM_ID_WGT) { |
| load_2d<bus_T, WGT_MAT_AXI_RATIO, VTA_WGT_ELEM_BYTES>( |
| weights, |
| wgt_mem, |
| insn.sram_base, |
| insn.dram_base, |
| insn.y_size, |
| insn.x_size, |
| insn.x_stride); |
| } |
| |
| // Push dependence token if instructed |
| if (insn.push_next_dep) { |
| l2g_dep_queue.write(1); |
| } |
| } |
| |
| void gemm( |
| insn_T insn_raw, |
| uop_T uop_mem[VTA_UOP_BUFF_DEPTH], |
| bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO], |
| bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], |
| bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], |
| bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { |
| #pragma HLS INLINE |
| |
| VTAGemInsn insn = *((VTAGemInsn *) &insn_raw); |
| |
| // Loop offset |
| acc_idx_T dst_offset_out = 0; |
| inp_idx_T src_offset_out = 0; |
| wgt_idx_T wgt_offset_out = 0; |
| |
| // Outer Loop |
| EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) { |
| acc_idx_T dst_offset_in = dst_offset_out; |
| inp_idx_T src_offset_in = src_offset_out; |
| wgt_idx_T wgt_offset_in = wgt_offset_out; |
| |
| // Inner Loop |
| EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) { |
| |
| // Iterate over micro op |
| READ_GEMM_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) { |
| #pragma HLS PIPELINE II = 1 |
| // Read micro-op fields |
| uop_T uop = uop_mem[upc]; |
| |
| // Decode indices |
| acc_idx_T dst_idx = |
| uop.range(VTA_UOP_GEM_0_1, VTA_UOP_GEM_0_0) + dst_offset_in; |
| inp_idx_T src_idx = |
| uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + src_offset_in; |
| wgt_idx_T wgt_idx = |
| uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + wgt_offset_in; |
| |
| // Read in weight tensor |
| wgt_T w_tensor[VTA_BLOCK_OUT][VTA_BLOCK_IN]; |
| read_tensor<bus_T, wgt_T, wgt_idx_T, VTA_BUS_WIDTH, VTA_WGT_WIDTH, VTA_BLOCK_OUT, VTA_BLOCK_IN>(wgt_idx, wgt_mem, w_tensor); |
| // Read in input tensor |
| inp_T i_tensor[VTA_BATCH][VTA_BLOCK_IN]; |
| read_tensor<bus_T, inp_T, inp_idx_T, VTA_BUS_WIDTH, VTA_INP_WIDTH, VTA_BATCH, VTA_BLOCK_IN>(src_idx, inp_mem, i_tensor); |
| // Read in accum tensor |
| acc_T a_tensor[VTA_BATCH][VTA_BLOCK_OUT]; |
| read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, a_tensor); |
| // Output tensor |
| out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT]; |
| |
| // Inner GEMM loop |
| for (int b = 0; b < VTA_BATCH; b++) { |
| for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) { |
| // Initialize the accumulator values |
| acc_T accum = a_tensor[b][oc]; |
| // Dot product sum |
| sum_T tmp = 0; |
| // Inner matrix multiplication loop (input channel/feature) |
| for (int ic = 0; ic < VTA_BLOCK_IN; ic++) { |
| wgt_T w_elem = w_tensor[oc][ic]; |
| inp_T i_elem = i_tensor[b][ic]; |
| mul_T prod_dsp = i_elem * w_elem; |
| tmp += (sum_T) prod_dsp; |
| } |
| // Update summation |
| accum += (acc_T) tmp; |
| // Write back result acc_mem |
| a_tensor[b][oc] = insn.reset_reg ? (acc_T) 0 : accum; |
| // And output vector |
| o_tensor[b][oc] = (out_T) accum.range(VTA_OUT_WIDTH - 1, 0); |
| } |
| } |
| |
| // Write the results back into accumulator |
| write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, a_tensor, acc_mem); |
| // Write the results back in the output buffer |
| write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem); |
| } |
| // Update offsets |
| dst_offset_in += insn.dst_factor_in; |
| src_offset_in += insn.src_factor_in; |
| wgt_offset_in += insn.wgt_factor_in; |
| } |
| // Update offsets |
| dst_offset_out += insn.dst_factor_out; |
| src_offset_out += insn.src_factor_out; |
| wgt_offset_out += insn.wgt_factor_out; |
| } |
| } |
| |
| void alu( |
| insn_T insn_raw, |
| uop_T uop_mem[VTA_UOP_BUFF_DEPTH], |
| bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO], |
| bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], |
| bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], |
| bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { |
| #pragma HLS INLINE |
| |
| VTAAluInsn insn = *((VTAAluInsn *) &insn_raw); |
| |
| // Loop offset |
| acc_idx_T dst_offset_out = 0; |
| inp_idx_T src_offset_out = 0; |
| |
| // Outer Loop |
| EXE_OUT_LOOP: for (int it_out = 0; it_out < insn.iter_out; it_out++) { |
| acc_idx_T dst_offset_in = dst_offset_out; |
| inp_idx_T src_offset_in = src_offset_out; |
| |
| // Inner Loop |
| EXE_IN_LOOP: for (int it_in = 0; it_in < insn.iter_in; it_in++) { |
| // Iterate over micro op |
| READ_ALU_UOP: for (int upc = insn.uop_bgn; upc < insn.uop_end; upc++) { |
| #pragma HLS PIPELINE II = 2 |
| // Read micro-op fields |
| uop_T uop = uop_mem[upc]; |
| |
| // Decode |
| acc_idx_T dst_idx = |
| uop.range(VTA_UOP_ALU_0_1, VTA_UOP_ALU_0_0) + dst_offset_in; |
| acc_idx_T src_idx = |
| uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + src_offset_in; |
| |
| // Read in src tensor |
| acc_T src_tensor[VTA_BATCH][VTA_BLOCK_OUT]; |
| read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(src_idx, acc_mem, src_tensor); |
| // Read in dst tensor |
| acc_T dst_tensor[VTA_BATCH][VTA_BLOCK_OUT]; |
| read_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, acc_mem, dst_tensor); |
| // Output tensor |
| out_T o_tensor[VTA_BATCH][VTA_BLOCK_OUT]; |
| |
| // Perform ALU op over matrix elements |
| for (int i = 0; i < VTA_BATCH; i++) { |
| for (int b = 0; b < VTA_BLOCK_OUT; b++) { |
| // Read in operands |
| acc_T src_0 = dst_tensor[i][b]; |
| acc_T src_1 = insn.use_imm ? (acc_T) insn.imm : src_tensor[i][b]; |
| aluop_shr_arg_T shft_by = src_1.range(VTA_SHR_ARG_BIT_WIDTH - 1, 0); |
| aluop_mul_arg_T mul_by = src_1.range(VTA_MUL_ARG_BIT_WIDTH - 1, 0); |
| if (insn.alu_opcode == VTA_ALU_OPCODE_MIN || insn.alu_opcode == VTA_ALU_OPCODE_MAX) { |
| // Compute Min/Max |
| acc_T mix_val = src_0 < src_1 ? |
| (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) : |
| (insn.alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0); |
| dst_tensor[i][b] = mix_val; |
| o_tensor[i][b] = (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0); |
| } else if (insn.alu_opcode == VTA_ALU_OPCODE_ADD) { |
| // Compute Sum |
| acc_T add_val = |
| src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0); |
| dst_tensor[i][b] = add_val; |
| o_tensor[i][b] = (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0); |
| } else if (insn.alu_opcode == VTA_ALU_OPCODE_SHR) { |
| // Compute Shift Right |
| acc_T shr_val = src_0 >> shft_by; |
| dst_tensor[i][b] = shr_val; |
| o_tensor[i][b] = (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0); |
| } |
| } |
| } |
| |
| // Write the results back into accumulator |
| write_tensor<bus_T, acc_T, acc_idx_T, VTA_BUS_WIDTH, VTA_ACC_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, dst_tensor, acc_mem); |
| // Write the results back in the output buffer |
| write_tensor<bus_T, out_T, acc_idx_T, VTA_BUS_WIDTH, VTA_OUT_WIDTH, VTA_BATCH, VTA_BLOCK_OUT>(dst_idx, o_tensor, out_mem); |
| } |
| // Update offsets |
| dst_offset_in += insn.dst_factor_in; |
| src_offset_in += insn.src_factor_in; |
| } |
| // Update offsets |
| dst_offset_out += insn.dst_factor_out; |
| src_offset_out += insn.src_factor_out; |
| } |
| } |
| |
| void compute( |
| volatile uint32_t &done, |
| volatile uop_T *uops, |
| volatile bus_T *biases, |
| hls::stream<insn_T> &gemm_queue, |
| hls::stream<bool> &l2g_dep_queue, |
| hls::stream<bool> &s2g_dep_queue, |
| hls::stream<bool> &g2l_dep_queue, |
| hls::stream<bool> &g2s_dep_queue, |
| bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO], |
| bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO], |
| bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { |
| PRAGMA_HLS(HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS offset = VTA_COMPUTE_DONE_WR_OFFSET) |
| #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port |
| #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port |
| #pragma HLS INTERFACE axis port = gemm_queue |
| #pragma HLS INTERFACE axis port = l2g_dep_queue |
| #pragma HLS INTERFACE axis port = s2g_dep_queue |
| #pragma HLS INTERFACE axis port = g2l_dep_queue |
| #pragma HLS INTERFACE axis port = g2s_dep_queue |
| #pragma HLS INTERFACE bram port = inp_mem |
| #pragma HLS INTERFACE bram port = wgt_mem |
| #pragma HLS INTERFACE bram port = out_mem |
| #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS |
| #pragma HLS RESOURCE variable = inp_mem core = RAM_1P |
| #pragma HLS RESOURCE variable = wgt_mem core = RAM_1P |
| #pragma HLS RESOURCE variable = out_mem core = RAM_1P |
| |
| // Micro-op storage |
| static uop_T uop_mem[VTA_UOP_BUFF_DEPTH]; |
| |
| // Accumulator storage |
| static bus_T acc_mem[VTA_ACC_BUFF_DEPTH][ACC_MAT_AXI_RATIO]; |
| #pragma HLS ARRAY_RESHAPE variable = acc_mem complete dim=2 |
| // This is necessary to obtain II=1 |
| #pragma HLS DEPENDENCE variable = acc_mem inter false |
| |
| // Pop GEMM instruction |
| insn_T raw_insn = gemm_queue.read(); |
| // Cast to GenericInsn |
| VTAInsn insn; |
| insn_T raw_copy = raw_insn; |
| insn.generic = *((VTAGenericInsn *) &raw_copy); |
| |
| // Pop dependence token if instructed |
| if (insn.generic.pop_prev_dep) { |
| l2g_dep_queue.read(); |
| } |
| if (insn.generic.pop_next_dep) { |
| s2g_dep_queue.read(); |
| } |
| |
| // Set done value |
| done = 0; |
| // Perform action based on opcode |
| if (insn.generic.opcode == VTA_OPCODE_FINISH) { |
| // Set done flag if we reach a FINISH instruction |
| done = 1; |
| } else if (insn.generic.opcode == VTA_OPCODE_LOAD) { |
| // Initialize indices |
| memop_sram_T sram_idx = insn.mem.sram_base; |
| memop_dram_T dram_idx = insn.mem.dram_base; |
| if (insn.mem.memory_type == VTA_MEM_ID_UOP) { |
| // Perform data transfer |
| memcpy(&uop_mem[sram_idx], |
| (const uop_T*) &uops[dram_idx], |
| insn.mem.x_size * sizeof(uop_T)); |
| } else if (insn.mem.memory_type == VTA_MEM_ID_ACC) { |
| // Perform data transfer from DRAM |
| load_2d<bus_T, ACC_MAT_AXI_RATIO, VTA_ACC_ELEM_BYTES>( |
| biases, |
| acc_mem, |
| sram_idx, |
| dram_idx, |
| insn.mem.y_size, |
| insn.mem.x_size, |
| insn.mem.x_stride); |
| } |
| } else if (insn.generic.opcode == VTA_OPCODE_GEMM) { |
| gemm(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); |
| } else if (insn.generic.opcode == VTA_OPCODE_ALU) { |
| alu(raw_copy, uop_mem, acc_mem, inp_mem, wgt_mem, out_mem); |
| } |
| |
| // Push dependence token if instructed |
| if (insn.generic.push_prev_dep) { |
| g2l_dep_queue.write(1); |
| } |
| if (insn.generic.push_next_dep) { |
| g2s_dep_queue.write(1); |
| } |
| } |
| |
| void store( |
| volatile bus_T *outputs, |
| hls::stream<insn_T> &store_queue, |
| hls::stream<bool> &g2s_dep_queue, |
| hls::stream<bool> &s2g_dep_queue, |
| bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]) { |
| #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port |
| #pragma HLS INTERFACE axis port = store_queue |
| #pragma HLS INTERFACE axis port = g2s_dep_queue |
| #pragma HLS INTERFACE axis port = s2g_dep_queue |
| #pragma HLS INTERFACE bram port = out_mem |
| #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS |
| #pragma HLS RESOURCE variable = out_mem core = RAM_1P |
| |
| // Pop store instruction |
| insn_T raw_insn = store_queue.read(); |
| // Cast to MemInsn |
| insn_T raw_copy = raw_insn; |
| VTAMemInsn insn = *((VTAMemInsn *) &raw_copy); |
| |
| // Pop dependence token if instructed |
| if (insn.pop_prev_dep) { |
| g2s_dep_queue.read(); |
| } |
| |
| // Initialize indices |
| memop_sram_T sram_idx = insn.sram_base; |
| memop_dram_T dram_idx = insn.dram_base; |
| |
| // Copy along y dimension |
| for (int y = 0; y < insn.y_size; y++) { |
| #pragma HLS PIPELINE |
| // Perform data transfer |
| memcpy( |
| const_cast<bus_T*>(&outputs[dram_idx * OUT_MAT_AXI_RATIO]), |
| (const bus_T*) &out_mem[sram_idx][0], |
| insn.x_size * VTA_OUT_ELEM_BYTES); |
| #pragma HLS RESOURCE variable = sram_idx core = Mul_LUT |
| sram_idx += insn.x_size; |
| dram_idx += insn.x_stride; |
| } |
| |
| // Push dependence token if instructed |
| if (insn.push_prev_dep) { |
| s2g_dep_queue.write(1); |
| } |
| } |
| |
| void vta( |
| uint32_t insn_count, |
| volatile insn_T *insns, |
| volatile uop_T *uops, |
| volatile bus_T *inputs, |
| volatile bus_T *weights, |
| volatile bus_T *biases, |
| volatile bus_T *outputs) { |
| #pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS |
| #pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port |
| #pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port |
| #pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port |
| #pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port |
| #pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port |
| #pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port |
| #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS |
| |
| // Instantiate temporary instruction queues (used for peeking) |
| hls::stream<insn_T> tmp_load_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_load_queue) |
| hls::stream<insn_T> tmp_gemm_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_gemm_queue) |
| hls::stream<insn_T> tmp_store_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=tmp_store_queue) |
| |
| // Instatiate physical instruction queues |
| hls::stream<insn_T> load_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=load_queue) |
| hls::stream<insn_T> gemm_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=gemm_queue) |
| hls::stream<insn_T> store_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=store_queue) |
| |
| // Dependence queues |
| hls::stream<bool> l2g_dep_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=l2g_dep_queue) |
| hls::stream<bool> s2g_dep_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=s2g_dep_queue) |
| hls::stream<bool> g2l_dep_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2l_dep_queue) |
| hls::stream<bool> g2s_dep_queue; |
| PRAGMA_HLS(HLS stream depth=STREAM_IN_DEPTH variable=g2s_dep_queue) |
| |
| // Instantiate memories |
| bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO]; |
| bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]; |
| bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]; |
| |
| // Push all instructions into the queues |
| fetch(insn_count, insns, tmp_load_queue, tmp_gemm_queue, tmp_store_queue); |
| |
| // Global done indicator |
| uint32_t done = 0; |
| |
| // Temporary instructions |
| insn_T tmp_load; |
| insn_T tmp_gemv; |
| insn_T tmp_store; |
| |
| // Peeking status |
| bool tmp_load_popped = false; |
| bool tmp_gemm_popped = false; |
| bool tmp_store_popped = false; |
| int exit_counter = 0; |
| |
| // Main control loop |
| while (true) { |
| // First execute as many load instructions as possible |
| while (!tmp_load_queue.empty() || tmp_load_popped == true) { |
| // Pop the load instruction |
| if (!tmp_load_popped) { |
| tmp_load_queue.read(tmp_load); |
| tmp_load_popped = true; |
| } |
| // Check dependences and invoke the load stage |
| VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_load); |
| if ((insn.pop_next_dep && !g2l_dep_queue.empty()) || |
| !insn.pop_next_dep) { |
| // Push the instruction in the load queue |
| load_queue.write(tmp_load); |
| tmp_load_popped = false; |
| load(inputs, weights, load_queue, g2l_dep_queue, l2g_dep_queue, inp_mem, wgt_mem); |
| } else { |
| // Execution of load stage pending on completion of other stages, so break here... |
| break; |
| } |
| } |
| // Next execute as many gemm instructions as possible |
| while (!tmp_gemm_queue.empty() || tmp_gemm_popped == true) { |
| // Pop the gemm instruction |
| if (!tmp_gemm_popped) { |
| tmp_gemm_queue.read(tmp_gemv); |
| tmp_gemm_popped = true; |
| } |
| // Check dependences and invoke the load stage |
| VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); |
| if ( |
| (insn.pop_prev_dep && !l2g_dep_queue.empty() && |
| insn.pop_next_dep && !s2g_dep_queue.empty()) || |
| (!insn.pop_prev_dep && insn.pop_next_dep && |
| !s2g_dep_queue.empty()) || |
| (insn.pop_prev_dep && !l2g_dep_queue.empty() && |
| !insn.pop_next_dep) || |
| (!insn.pop_prev_dep && !insn.pop_next_dep) |
| ) { |
| // Push the instruction in the load queue |
| gemm_queue.write(tmp_gemv); |
| tmp_gemm_popped = false; |
| compute(done, uops, biases, gemm_queue, l2g_dep_queue, s2g_dep_queue, |
| g2l_dep_queue, g2s_dep_queue, inp_mem, wgt_mem, out_mem); |
| } else { |
| // Execution of load stage pending on completion of other stages, |
| // so break here... |
| break; |
| } |
| } |
| // Finally execute as many store instructions as possible |
| while (!tmp_store_queue.empty() || tmp_store_popped == true) { |
| // Pop the load instruction |
| if (!tmp_store_popped) { |
| tmp_store_queue.read(tmp_store); |
| tmp_store_popped = true; |
| } |
| // Check dependences and invoke the load stage |
| VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_store); |
| |
| if ((insn.pop_prev_dep && !g2s_dep_queue.empty()) || |
| !insn.pop_prev_dep) { |
| // Push the instruction in the load queue |
| store_queue.write(tmp_store); |
| tmp_store_popped = false; |
| store(outputs, store_queue, g2s_dep_queue, s2g_dep_queue, out_mem); |
| } else { |
| // Execution of load stage pending on completion of other stages, so break here... |
| break; |
| } |
| } |
| // Check if we get a signal that we are done |
| if (done) { |
| break; |
| } |
| exit_counter++; |
| if (exit_counter > 1000) { |
| if (tmp_load_popped) { |
| if (g2l_dep_queue.empty()) { |
| printf("waiting on g2l\n"); |
| } |
| } |
| if (tmp_gemm_popped) { |
| VTAGenericInsn insn = *((VTAGenericInsn *) &tmp_gemv); |
| if (l2g_dep_queue.empty() && insn.pop_prev_dep) { |
| printf("waiting on l2g\n"); |
| } |
| if (s2g_dep_queue.empty() && insn.pop_next_dep) { |
| printf("waiting on s2g\n"); |
| } |
| } |
| if (tmp_store_popped) { |
| if (g2s_dep_queue.empty()) { |
| printf("waiting on g2s\n"); |
| } |
| } |
| break; |
| } |
| } |
| |
| // Ensure that the tokens are empty |
| bool tmp_tok; |
| int l2g_count = 0; |
| int s2g_count = 0; |
| int g2l_count = 0; |
| int g2s_count = 0; |
| while (l2g_dep_queue.read_nb(tmp_tok)) { |
| l2g_count++; |
| } |
| while (s2g_dep_queue.read_nb(tmp_tok)) { |
| s2g_count++; |
| } |
| while (g2l_dep_queue.read_nb(tmp_tok)) { |
| g2l_count++; |
| } |
| while (g2s_dep_queue.read_nb(tmp_tok)) { |
| g2s_count++; |
| } |
| |
| assert(l2g_count == 0 && s2g_count == 0 && g2l_count == 0 && g2s_count == 0); |
| } |