/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
#include "test_lib.h"
#ifdef NO_SIM
#ifdef VTA_TARGET_PYNQ
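// Run an instruction stream on the VTA accelerator through the PYNQ driver:
// map the module registers, hand physical buffer addresses to each stage,
// kick off the pipeline, busy-wait for completion, and return the elapsed
// wall-clock time in nanoseconds.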
uint64_t vta(
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
uint32_t *inputs,
uint32_t *weights,
uint32_t *biases,
uint32_t *outputs) {
// Performance counter variables
uint64_t t_fpga;
struct timespec start, stop;
// Derive bitstream file name (the parameter strings below are currently
// unused; the name is hard-coded to "vta.bit")
char bitstream[128];
char str_batch_size[4];
char str_block_out_size[4];
char str_block_in_size[4];
char str_block_bit_width[4];
snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
// Get VTA handles
void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR);
void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR);
void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR);
void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR);
// Physical address pointers
uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
#if VTA_DEBUG == 1
printf("INFO - Starting FPGA!\n");
#endif
clock_gettime(CLOCK_REALTIME, &start);
VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insn_phy);
if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, input_phy);
if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weight_phy);
if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uop_phy);
if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, bias_phy);
if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, output_phy);
// VTA start
VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
int flag = 0, t = 0;
for (t = 0; t < 10000000; ++t) {
flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET);
if (flag & VTA_DONE) break;
}
bool timed_out = (t == 10000000);
if (timed_out) {
printf("\tWARNING: VTA TIMEOUT!!!!\n");
}
#if VTA_DEBUG == 1
if (!timed_out) {
printf("INFO - FPGA Finished!\n");
}
#endif
clock_gettime(CLOCK_REALTIME, &stop);
t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
// Unmap VTA registers
VTAUnmapRegister(vta_fetch_handle);
VTAUnmapRegister(vta_load_handle);
VTAUnmapRegister(vta_compute_handle);
VTAUnmapRegister(vta_store_handle);
return t_fpga;
}
#endif // VTA_TARGET_PYNQ
#endif // NO_SIM
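// Seed consumed by rand_r() when generating pseudo-random test data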
uint32_t globalSeed;
const char* getOpcodeString(int opcode, bool use_imm) {
// Return a human-readable name for the given ALU opcode
if (opcode == VTA_ALU_OPCODE_MIN) {
if (use_imm) {
return "min imm";
} else {
return "min";
}
} else if (opcode == VTA_ALU_OPCODE_MAX) {
if (use_imm) {
return "max imm";
} else {
return "max";
}
} else if (opcode == VTA_ALU_OPCODE_ADD) {
if (use_imm) {
return "add imm";
} else {
return "add";
}
} else if (opcode == VTA_ALU_OPCODE_SHR) {
return "shr";
}
// else if (opcode == VTA_ALU_OPCODE_MUL) {
// return "mul";
// }
return "unknown op";
}
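// Pack a 2D array of SRC_T_WIDTH-bit elements into a flat buffer of
// DST_T_WIDTH-bit words, tiling the (y_size, x_size) array into contiguous
// (y_block, x_block) blocks. An illustrative call with hypothetical shapes:
// packing a 4x16 array of 8-bit values into 32-bit words with 1x16 tiles
// would be packBuffer<uint32_t, 32, int8_t, 8>(dst, src, 4, 16, 1, 16);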
template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
void packBuffer(DST_T *dst, SRC_T **src, int y_size, int x_size, int y_block, int x_block) {
assert((SRC_T_WIDTH * x_block * y_block) % DST_T_WIDTH == 0);
assert(DST_T_WIDTH <= 64);
int buffer_idx = 0;
int ratio = DST_T_WIDTH / SRC_T_WIDTH;
long long int mask = (1ULL << SRC_T_WIDTH) - 1;
DST_T tmp = 0;
for (int i = 0; i < y_size / y_block; i++) {
for (int j = 0; j < x_size / x_block; j++) {
for (int k = 0; k < y_block; k++) {
for (int l = 0; l < x_block; l++) {
int block_idx = l + k * x_block;
tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * SRC_T_WIDTH);
// When tmp is packed, write to destination array
if (block_idx % ratio == ratio - 1) {
dst[buffer_idx++] = tmp;
tmp = 0;
}
}
}
}
}
}
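// Inverse of packBuffer: unpack a flat buffer of SRC_T_WIDTH-bit words back
// into a 2D array of DST_T_WIDTH-bit elements using the same block tiling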
template <typename DST_T, int DST_T_WIDTH, typename SRC_T, int SRC_T_WIDTH>
void unpackBuffer(DST_T **dst, SRC_T *src, int y_size, int x_size, int y_block, int x_block) {
assert((DST_T_WIDTH * x_block * y_block) % SRC_T_WIDTH == 0);
int buffer_idx = 0;
long long int mask = (1ULL << DST_T_WIDTH) - 1;
int ratio = SRC_T_WIDTH / DST_T_WIDTH;
for (int i = 0; i < y_size / y_block; i++) {
for (int j = 0; j < x_size / x_block; j++) {
for (int k = 0; k < y_block; k++) {
for (int l = 0; l < x_block; l++) {
int block_idx = l + k * x_block;
dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * DST_T_WIDTH)) & mask;
if (block_idx % ratio == ratio - 1) {
buffer_idx++;
}
}
}
}
}
}
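// Allocate a rows x cols 2D array and fill it with pseudo-random values
// drawn from rand_r(&globalSeed)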
template <typename T>
T ** allocInit2dArray(int rows, int cols) {
// Allocate
T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
for (int i = 0; i < rows; i++) {
array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
}
// Init
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
array[i][j] = static_cast<T>(rand_r(&globalSeed));
}
}
return array;
}
template <typename T>
T ** allocSet2dArray(int rows, int cols, int val) {
// Allocate
T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
for (int i = 0; i < rows; i++) {
array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
}
// Init
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
array[i][j] = static_cast<T>(val);
}
}
return array;
}
template <typename T>
T ** alloc2dArray(int rows, int cols) {
T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
for (int i = 0; i < rows; i++) {
array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
}
return array;
}
template <typename T>
void free2dArray(T **array, int rows, int cols) {
for (int i = 0; i < rows; i++) {
free(array[i]);
}
free(array);
}
template <typename T>
T *** alloc3dArray(int rows, int cols, int depth) {
T ***array = static_cast<T ***>(malloc(sizeof(T **) * rows));
for (int i = 0; i < rows; i++) {
array[i] = static_cast<T **>(malloc(sizeof(T *) * cols));
for (int j = 0; j < cols; j++) {
array[i][j] = static_cast<T*>(malloc(sizeof(T) * depth));
}
}
return array;
}
template <typename T>
void free3dArray(T *** array, int rows, int cols, int depth) {
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
free(array[i][j]);
}
free(array[i]);
}
free(array);
}
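// Allocate a data buffer: a physically contiguous cached buffer when
// targeting hardware (NO_SIM), or a regular heap buffer under simulation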
void * allocBuffer(size_t num_bytes) {
#ifdef NO_SIM
return VTAMemAlloc(num_bytes, VTA_CACHED);
#else
return malloc(num_bytes);
#endif
}
void freeBuffer(void * buffer) {
#ifdef NO_SIM
return VTAMemFree(buffer);
#else
return free(buffer);
#endif
}
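// Build a 2D strided load/store instruction that moves a (y_size, x_size)
// tile between DRAM (dram_offset, row stride x_stride) and SRAM
// (sram_offset), with symmetric x/y zero-padding and the given
// dependence-queue push/pop flags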
VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
int push_prev_dep, int push_next_dep) {
// Converter
union VTAInsn converter;
// Memory instruction initialization
VTAMemInsn insn = {};
insn.opcode = opcode;
insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep;
insn.push_next_dep = push_next_dep;
insn.memory_type = type;
insn.sram_base = sram_offset;
insn.dram_base = dram_offset;
insn.y_size = y_size;
insn.x_size = x_size;
insn.x_stride = x_stride;
insn.y_pad_0 = y_pad;
insn.y_pad_1 = y_pad;
insn.x_pad_0 = x_pad;
insn.x_pad_1 = x_pad;
converter.mem = insn;
return converter.generic;
}
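// Build a 1D load/store instruction: a single-row 2D transfer of `size`
// elements with no padding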
VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
// Converter
union VTAInsn converter;
// Memory instruction initialization
VTAMemInsn insn = {};
insn.opcode = opcode;
insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep;
insn.push_next_dep = push_next_dep;
insn.memory_type = type;
insn.sram_base = sram_offset;
insn.dram_base = dram_offset;
insn.y_size = 1;
insn.x_size = size;
insn.x_stride = size;
insn.y_pad_0 = 0;
insn.y_pad_1 = 0;
insn.x_pad_0 = 0;
insn.x_pad_1 = 0;
converter.mem = insn;
return converter.generic;
}
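// Build a GEMM instruction. Without uop compression one micro-op is stored
// per (batch, in_feat, out_feat) triple; with compression only `batch`
// micro-ops are stored and the in_feat/out_feat loops are recovered from
// the instruction's two-level affine loop nest (iter/factor fields)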
VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
int push_next_dep) {
// Converter
union VTAInsn converter;
// GEMM instruction initialization
VTAGemInsn insn;
insn.opcode = VTA_OPCODE_GEMM;
insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep;
insn.push_next_dep = push_next_dep;
insn.reset_reg = false;
if (!uop_compression) {
insn.uop_bgn = uop_offset;
insn.uop_end = uop_offset + batch * in_feat * out_feat;
insn.iter_out = 1;
insn.iter_in = 1;
insn.dst_factor_out = 0;
insn.src_factor_out = 0;
insn.wgt_factor_out = 0;
insn.dst_factor_in = 0;
insn.src_factor_in = 0;
insn.wgt_factor_in = 0;
} else {
insn.uop_bgn = uop_offset;
insn.uop_end = uop_offset + batch;
insn.iter_out = in_feat;
insn.iter_in = out_feat;
insn.dst_factor_out = 0;
insn.src_factor_out = 1;
insn.wgt_factor_out = 1;
insn.dst_factor_in = 1;
insn.src_factor_in = 0;
insn.wgt_factor_in = in_feat;
}
converter.gemm = insn;
return converter.generic;
}
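// Build an ALU instruction over `vector_size` vectors, operating either on
// a second operand in the accumulator buffer or on the immediate `imm`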
VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bool uop_compression,
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
// Converter
union VTAInsn converter;
// Memory instruction initialization
VTAAluInsn insn = {};
insn.opcode = VTA_OPCODE_ALU;
insn.pop_prev_dep = pop_prev_dep;
insn.pop_next_dep = pop_next_dep;
insn.push_prev_dep = push_prev_dep;
insn.push_next_dep = push_next_dep;
insn.reset_reg = false;
if (!uop_compression) {
insn.uop_bgn = 0;
insn.uop_end = vector_size;
insn.iter_out = 1;
insn.iter_in = 1;
insn.dst_factor_out = 0;
insn.src_factor_out = 0;
insn.dst_factor_in = 0;
insn.src_factor_in = 0;
insn.alu_opcode = opcode;
insn.use_imm = use_imm;
insn.imm = imm;
} else {
insn.uop_bgn = 0;
insn.uop_end = 1;
insn.iter_out = 1;
insn.iter_in = vector_size;
insn.dst_factor_out = 0;
insn.src_factor_out = 0;
insn.dst_factor_in = 1;
insn.src_factor_in = 1;
insn.alu_opcode = opcode;
insn.use_imm = use_imm;
insn.imm = imm;
}
converter.alu = insn;
return converter.generic;
}
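// Build a FINISH instruction, which flags completion to the host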
VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
// Converter
union VTAInsn converter;
// GEMM instruction initialization
VTAGemInsn insn;
insn.opcode = VTA_OPCODE_FINISH;
insn.pop_prev_dep = pop_prev;
insn.pop_next_dep = pop_next;
insn.push_prev_dep = 0;
insn.push_next_dep = 0;
insn.reset_reg = false;
insn.uop_bgn = 0;
insn.uop_end = 0;
insn.iter_out = 0;
insn.iter_in = 0;
insn.dst_factor_out = 0;
insn.src_factor_out = 0;
insn.wgt_factor_out = 0;
insn.dst_factor_in = 0;
insn.src_factor_in = 0;
insn.wgt_factor_in = 0;
converter.gemm = insn;
return converter.generic;
}
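// Generate micro-ops that sweep a (y_size, x_size) destination tile, as
// used by copy/reset instruction sequences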
VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
// Derive the total uop size
int uop_size = (uop_compression) ? 1 : y_size * x_size;
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else
VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif
if (!uop_compression) {
int uop_idx = 0;
for (int i = 0; i < y_size; i++) {
for (int j = 0; j < x_size; j++) {
uop_buf[uop_idx].dst_idx = i * x_size + j;
uop_buf[uop_idx].src_idx = 0;
uop_buf[uop_idx].wgt_idx = 0;
uop_idx++;
}
}
} else {
uop_buf[0].dst_idx = 1;
uop_buf[0].src_idx = 0;
uop_buf[0].wgt_idx = 0;
}
return uop_buf;
}
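// Generate the GEMM micro-op kernel. When multi_threaded is set, a second
// copy of the kernel is appended with its input/weight indices offset so
// that two virtual threads can operate on disjoint halves of the buffers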
VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
bool multi_threaded) {
// Derive the total uop size
int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat;
if (multi_threaded) uop_size *= 2;
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else
VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif
if (!uop_compression) {
int uop_idx = 0;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < in_feat; j++) {
for (int k = 0; k < out_feat; k++) {
uop_buf[uop_idx].dst_idx = i * out_feat + k;
uop_buf[uop_idx].src_idx = i * in_feat + j;
uop_buf[uop_idx].wgt_idx = k * in_feat + j;
uop_idx++;
}
}
}
} else {
for (int i = 0; i < batch; i++) {
uop_buf[i].dst_idx = i * out_feat;
uop_buf[i].src_idx = i * in_feat;
uop_buf[i].wgt_idx = 0;
}
}
if (multi_threaded) {
if (!uop_compression) {
int uop_idx = uop_size / 2;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < in_feat; j++) {
for (int k = 0; k < out_feat; k++) {
uop_buf[uop_idx].dst_idx = i * out_feat + k;
uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j;
uop_buf[uop_idx].wgt_idx = out_feat * in_feat + k * in_feat + j;
uop_idx++;
}
}
}
} else {
for (int i = 0; i < batch; i++) {
uop_buf[batch+i].dst_idx = i * out_feat;
uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat;
uop_buf[batch+i].wgt_idx = out_feat * in_feat;
}
}
}
return uop_buf;
}
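// Generate micro-ops for an element-wise ALU map: destination vectors sit
// in the first `vector_size` accumulator entries, sources in the next
// `vector_size` entries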
VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
// Derive the total uop size
int uop_size = (uop_compression) ? 1 : vector_size;
// Allocate buffer
#ifdef NO_SIM
VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
#else
VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
#endif
if (!uop_compression) {
for (int i = 0; i < vector_size; i++) {
uop_buf[i].dst_idx = i;
uop_buf[i].src_idx = vector_size + i;
}
} else {
uop_buf[0].dst_idx = 0;
uop_buf[0].src_idx = vector_size;
}
return uop_buf;
}
void printParameters() {
// Print the VTA compile-time configuration
printf("Size of VTAInsn: %d\n", static_cast<int>(sizeof(VTAGenericInsn)));
printf("Size of VTAUop: %d\n", static_cast<int>(sizeof(VTAUop)));
printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH);
printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH);
printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH);
printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH);
printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH);
printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH);
printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH);
printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH);
printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT);
printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN);
printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT);
printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES);
printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES);
printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES);
printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES);
printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
}
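// Helper function: Print the instruction stream and track the balance of
// the four dependence queues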
void printInstruction(int num_insn, VTAGenericInsn *insns) {
// Keep tabs on dependence queues
int l2g_queue = 0;
int g2l_queue = 0;
int s2g_queue = 0;
int g2s_queue = 0;
// Converter
union VTAInsn c;
// Iterate over all instructions
printf("DEBUG - There are %u instructions\n", num_insn);
for (int i = 0; i < num_insn; i++) {
// Fetch instruction and decode opcode
c.generic = insns[i];
printf("DEBUG - INSTRUCTION %u: ", i);
if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
// Print instruction field information
if (c.mem.opcode == VTA_OPCODE_LOAD) {
printf("LOAD ");
if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
}
if (c.mem.opcode == VTA_OPCODE_STORE) {
printf("STORE ACC\n");
}
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
static_cast<int>(c.mem.dram_base),
static_cast<int>(c.mem.sram_base));
printf("\ty: size=%d, pad=[%d, %d]\n",
static_cast<int>(c.mem.y_size),
static_cast<int>(c.mem.y_pad_0),
static_cast<int>(c.mem.y_pad_1));
printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
static_cast<int>(c.mem.x_size),
static_cast<int>(c.mem.x_stride),
static_cast<int>(c.mem.x_pad_0),
static_cast<int>(c.mem.x_pad_1));
if (c.mem.opcode == VTA_OPCODE_STORE) {
if (c.mem.pop_prev_dep) g2s_queue--;
if (c.mem.push_prev_dep) s2g_queue++;
} else if (c.mem.opcode == VTA_OPCODE_LOAD &&
(c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) {
if (c.mem.pop_next_dep) g2l_queue--;
if (c.mem.push_next_dep) l2g_queue++;
} else {
if (c.mem.pop_prev_dep) l2g_queue--;
if (c.mem.push_prev_dep) g2l_queue++;
if (c.mem.pop_next_dep) s2g_queue--;
if (c.mem.push_next_dep) g2s_queue++;
}
} else if (c.mem.opcode == VTA_OPCODE_GEMM) {
// Print instruction field information
printf("GEMM\n");
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\trange (%d, %d)\n",
static_cast<int>(c.gemm.uop_bgn),
static_cast<int>(c.gemm.uop_end));
printf("\treset_out: %d\n", static_cast<int>(c.gemm.reset_reg));
printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
static_cast<int>(c.gemm.iter_out),
static_cast<int>(c.gemm.dst_factor_out),
static_cast<int>(c.gemm.src_factor_out),
static_cast<int>(c.gemm.wgt_factor_out));
printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
static_cast<int>(c.gemm.iter_in),
static_cast<int>(c.gemm.dst_factor_in),
static_cast<int>(c.gemm.src_factor_in),
static_cast<int>(c.gemm.wgt_factor_in));
if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++;
if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
} else if (c.mem.opcode == VTA_OPCODE_FINISH) {
printf("FINISH\n");
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
if (c.gemm.pop_prev_dep) l2g_queue--;
if (c.gemm.push_prev_dep) g2l_queue++;
if (c.gemm.pop_next_dep) s2g_queue--;
if (c.gemm.push_next_dep) g2s_queue++;
} else if (c.mem.opcode == VTA_OPCODE_ALU) {
// Print instruction field information
printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
static_cast<int>(c.mem.pop_prev_dep),
static_cast<int>(c.mem.pop_next_dep),
static_cast<int>(c.mem.push_prev_dep),
static_cast<int>(c.mem.push_next_dep));
printf("\treset_out: %d\n", static_cast<int>(c.alu.reset_reg));
printf("\trange (%d, %d)\n",
static_cast<int>(c.alu.uop_bgn),
static_cast<int>(c.alu.uop_end));
printf("\touter loop - iter: %d, dst: %d, src: %d\n",
static_cast<int>(c.alu.iter_out),
static_cast<int>(c.alu.dst_factor_out),
static_cast<int>(c.alu.src_factor_out));
printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
static_cast<int>(c.alu.iter_in),
static_cast<int>(c.alu.dst_factor_in),
static_cast<int>(c.alu.src_factor_in));
if (c.alu.pop_prev_dep) l2g_queue--;
if (c.alu.push_prev_dep) g2l_queue++;
if (c.alu.pop_next_dep) s2g_queue--;
if (c.alu.push_next_dep) g2s_queue++;
}
}
printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
printf("DEBUG - s2g_queue = %d, g2s_queue = %d\n", s2g_queue, g2s_queue);
}
// Helper function: Print micro-ops status
void printMicroOp(int num_uop, VTAUop *uops) {
// Iterate over all micro ops
printf("DEBUG - There are %u micro-ops\n", num_uop);
for (int i = 0; i < num_uop; i++) {
// Read micro-op
printf("DEBUG - UOP %u: ", i);
printf("acc=%u, inp= %u, wgt=%u\n", uops[i].dst_idx, uops[i].src_idx, uops[i].wgt_idx);
}
}
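// ALU test: load vectors into the accumulator buffer, apply the requested
// ALU opcode (optionally against an immediate), store the results, and
// check them against a software reference. An illustrative call with
// hypothetical sizes: alu_test(VTA_ALU_OPCODE_ADD, true, VTA_BATCH,
// 4 * VTA_BLOCK_OUT, true);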
int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
// Some assertions
assert(batch % VTA_BATCH == 0);
assert(vector_size % VTA_BLOCK_OUT == 0);
printf("=====================================================================================\n");
printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
// Instruction count
int ins_size = 3 * batch / VTA_BATCH + 2;
// Micro op count
int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
// Input/output elements in each transfer
int tx_size = vector_size / VTA_BLOCK_OUT;
// Number of input sets to be generated
int input_sets = (use_imm) ? 1 : 2;
// Make sure we don't exceed buffer bounds
assert(uop_size <= VTA_UOP_BUFF_DEPTH);
assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
// Immediate values
acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
for (int b = 0; b < batch / VTA_BATCH; b++) {
if (opcode == VTA_ALU_OPCODE_MIN) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_MAX) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_ADD) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
immediate[b] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
}
// else if (opcode == VTA_ALU_OPCODE_MUL) {
// immediate[b] = static_cast<acc_T>(
// rand_r(&globalSeed) % (1LL << (VTA_MUL_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_MUL_ARG_BIT_WIDTH - 2)));
// }
}
// Initialize instructions
VTAGenericInsn *insn_buf =
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
int insn_idx = 0;
insn_buf[insn_idx++] =
get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
for (int b = 0; b < batch; b += VTA_BATCH) {
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_ACC, // type
0, // sram offset
b / VTA_BATCH * tx_size * input_sets, // dram offset
1, // y size
tx_size * input_sets, // x size
tx_size * input_sets, // x stride
0, // y pad
0, // x pad
0, // pop prev dep
b > 0, // pop next dep
0, // push prev dep
0); // push next dep
insn_buf[insn_idx++] = getALUInsn(
opcode, // opcode
tx_size, // vector size
use_imm, // use imm
immediate[b / VTA_BATCH], // imm
uop_compression, // uop compression
0, // pop prev dep
0, // pop next dep
0, // push prev dep
1); // push next dep
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_STORE, // opcode
VTA_MEM_ID_OUT, // type
0, // sram offset
b / VTA_BATCH * tx_size, // dram offset
1, // y size
tx_size, // x size
tx_size, // x stride
0, // y pad
0, // x pad
1, // pop prev dep
0, // pop next dep
1, // push prev dep
0); // push next dep
}
// Finish
insn_buf[insn_idx++] = getFinishInsn(0, 1);
// Prepare the uop buffer
VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
#if VTA_DEBUG == 1
printInstruction(ins_size, insn_buf);
printMicroOp(uop_size, uop_buf);
#endif
// Initialize the input/output data
acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size * input_sets; j++) {
if (opcode == VTA_ALU_OPCODE_MIN) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_MAX) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
} else if (opcode == VTA_ALU_OPCODE_ADD) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
} else if (opcode == VTA_ALU_OPCODE_SHR) {
inputs[i][j] = static_cast<acc_T>(
rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
}
}
}
// Compute reference output
out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size; j++) {
acc_T out_val = 0;
acc_T imm_val = immediate[i / VTA_BATCH];
acc_T src_val = inputs[i][j + vector_size];
if (opcode == VTA_ALU_OPCODE_MIN) {
if (!use_imm) {
out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
} else {
out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_MAX) {
if (!use_imm) {
out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
} else {
out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_ADD) {
if (!use_imm) {
out_val = inputs[i][j] + src_val;
} else {
out_val = inputs[i][j] + imm_val;
}
} else if (opcode == VTA_ALU_OPCODE_SHR) {
if (!use_imm) {
if (src_val >= 0) {
out_val = inputs[i][j] >> src_val;
} else {
out_val = inputs[i][j] << (0 - src_val);
}
} else {
if (imm_val >= 0) {
out_val = inputs[i][j] >> imm_val;
} else {
out_val = inputs[i][j] << (0 - imm_val);
}
}
}
outputs_ref[i][j] = (out_T) out_val;
}
}
// Pack input buffer
uint32_t *bias_buf = static_cast<uint32_t *>(
allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(
bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
// Prepare output buffer
uint32_t *output_buf = static_cast<uint32_t *>(
allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
#ifdef NO_SIM
// Invoke the VTA
uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
// Report timing
printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
#else
// Invoke the VTA
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile bus_T *) NULL,
(volatile bus_T *) NULL,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output buffer
out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
vector_size,
VTA_BATCH,
VTA_BLOCK_OUT);
// Correctness checks
int err = 0;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < vector_size; j++) {
if (outputs_ref[i][j] != outputs[i][j]) {
err++;
#if VTA_DEBUG == 1
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
static_cast<int>(outputs_ref[i][j]),
static_cast<int>(outputs[i][j]));
#endif
}
}
}
// Free all allocated arrays
free(immediate);
free2dArray<acc_T>(inputs, batch, vector_size * input_sets);
free2dArray<out_T>(outputs_ref, batch, vector_size);
free2dArray<out_T>(outputs, batch, vector_size);
freeBuffer(insn_buf);
freeBuffer(uop_buf);
freeBuffer(bias_buf);
freeBuffer(output_buf);
if (err == 0) {
printf("INFO - ALU test successful!\n");
return 0;
} else {
printf("INFO - ALU test failed, got %d errors!\n", err);
return -1;
}
}
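// Blocked GEMM test: tile a (batch x channels) by (channels x channels)
// matrix multiply into block-sized sub-problems, optionally double-buffering
// loads against compute via virtual threads, and check the result against a
// software reference. An illustrative call with hypothetical sizes:
// blocked_gemm_test(256, 256, VTA_BLOCK_OUT * 4, true, 2);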
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads) {
// Some assertions
assert(block % VTA_BLOCK_IN == 0);
assert(block % VTA_BLOCK_OUT == 0);
assert(block % VTA_BATCH == 0);
assert(channels % block == 0);
assert(batch % block == 0);
printf("=====================================================================================\n");
printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n",
batch, channels, block, uop_compression, virtual_threads);
// Input/output channels
int in_feat = channels;
int out_feat = channels;
// Derive number of elements that need to be loaded/stored
int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
int uop_size = uop_compression ?
block / VTA_BATCH * virtual_threads :
block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads;
int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN;
int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT;
int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT;
// Blocked buffer sizes (in terms of elements)
int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN;
int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT;
int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT;
// Make sure we don't exceed buffer bounds
assert(uop_size <= VTA_UOP_BUFF_DEPTH);
assert(inp_block_size <= VTA_INP_BUFF_DEPTH);
assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH);
assert(out_block_size <= VTA_ACC_BUFF_DEPTH);
// Initialize instruction buffer
VTAGenericInsn *insn_buf =
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
int insn_idx = 0;
// Load uops
insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD,
VTA_MEM_ID_UOP,
0,
0,
uop_size,
0,
0,
0,
0);
// Iterate over batch blocks
for (int i = 0; i < batch; i += block) {
// Iterate over output channel blocks
for (int j = 0; j < out_feat; j += block) {
// Load bias block (pop next if not first, push prev)
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_ACC, // type
0, // sram offset
(i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset
block / VTA_BATCH, // y size
block / VTA_BLOCK_OUT, // x size
out_feat / VTA_BLOCK_OUT, // x stride
0, // y pad
0, // x pad
0, // pop prev dep
(i > 0 || j > 0), // pop next dep
(virtual_threads == 1), // push prev dep
0); // push next dep
// Iterate over input channel blocks
for (int k = 0; k < in_feat; k += block * virtual_threads) {
for (int l = 0; l < block * virtual_threads; l += block) {
// Derive dependence flags
bool pop = (virtual_threads == 1) ?
1 :
(i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block);
bool push_prev = (virtual_threads == 1) ?
((k + l) != in_feat - block) :
((k + l) != in_feat - virtual_threads * block) &&
(
(k + l != in_feat - block) ||
(j != out_feat - block) ||
(i != batch - block));
bool push_next = (k + l == in_feat - block);
// Load weight block (pop next)
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_WGT, // type
l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT, // sram offset
(j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN, // dram offset
block / VTA_BLOCK_OUT, // y size
block / VTA_BLOCK_IN, // x size
in_feat / VTA_BLOCK_IN, // x stride
0, // y pad
0, // x pad
0, // pop prev dep
pop, // pop next dep
0, // push prev dep
0); // push next dep
// Load input block (push next)
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_INP, // type
l / VTA_BLOCK_IN * block / VTA_BATCH, // sram offset
(i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN, // dram offset
block / VTA_BATCH, // y size
block / VTA_BLOCK_IN, // x size
in_feat / VTA_BLOCK_IN, // x stride
0, // y pad
0, // x pad
0, // pop prev dep
0, // pop next dep
0, // push prev dep
1); // push next dep
// Perform GEMM (pop prev, push prev if not last, push next if last)
insn_buf[insn_idx++] = getGEMMInsn(
l / block * uop_size / virtual_threads, // uop offset
block / VTA_BATCH, // batch
block / VTA_BLOCK_IN, // in_feat
block / VTA_BLOCK_OUT, // out_feat
uop_compression, // uop_compression
1, // pop_prev_dep
0, // pop_next_dep
push_prev, // push prev dep
push_next); // push_next_dep
}
}
// Store output block (pop prev, push prev if not last)
insn_buf[insn_idx++] = get2DLoadStoreInsn(
VTA_OPCODE_STORE, // opcode
VTA_MEM_ID_OUT, // type
0, // sram offset
(i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT, // dram offset
block / VTA_BATCH, // y size
block / VTA_BLOCK_OUT, // x size
out_feat / VTA_BLOCK_OUT, // x stride
0, // y pad
0, // x pad
1, // pop prev dep
0, // pop next dep
1, // push prev dep
0); // push next dep
}
}
// Finish
insn_buf[insn_idx++] = getFinishInsn(0, 1);
// Prepare the uop buffer
VTAUop * uop_buf = getGEMMUops(
block / VTA_BATCH,
block / VTA_BLOCK_IN,
block / VTA_BLOCK_OUT,
uop_compression,
virtual_threads > 1);
#if VTA_DEBUG == 1
printInstruction(ins_size, insn_buf);
printMicroOp(uop_size, uop_buf);
#endif
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T>(batch, in_feat);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T>(out_feat, in_feat);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T>(batch, out_feat);
// Reference GEMM implementation
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_feat; j++) {
acc_T sum = biases[i][j];
for (int k = 0; k < in_feat; k++) {
sum += (acc_T) (inputs[i][k] * weights[j][k]);
}
// Set
outputs_ref[i][j] = (out_T) sum;
}
}
// Prepare the input buffer
uint32_t *input_buf = static_cast<uint32_t *>(
allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
inputs,
batch,
in_feat,
VTA_BATCH,
VTA_BLOCK_IN);
// Prepare the weight buffer
uint32_t *weight_buf = static_cast<uint32_t *>(
allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
weights,
out_feat,
in_feat,
VTA_BLOCK_OUT,
VTA_BLOCK_IN);
// Prepare the bias buffer
uint32_t *bias_buf = static_cast<uint32_t *>(
allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
biases,
batch,
out_feat,
VTA_BATCH,
VTA_BLOCK_OUT);
// Prepare the output buffer
uint32_t *output_buf = static_cast<uint32_t *>(
allocBuffer(VTA_OUT_ELEM_BYTES * out_size));
#ifdef NO_SIM
// Invoke the VTA
uint64_t t_fpga = vta(ins_size,
insn_buf,
uop_buf,
input_buf,
weight_buf,
bias_buf,
output_buf);
// Report timing
printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
printf("INFO - Throughput: %.3lfGOPs/s\n",
static_cast<float>(batch) * in_feat * out_feat * 2 / t_fpga);
#else
// Invoke the VTA
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile bus_T *) input_buf,
(volatile bus_T *) weight_buf,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output data
out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
out_feat,
VTA_BATCH,
VTA_BLOCK_OUT);
// Correctness checks
int err = 0;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_feat; j++) {
if (outputs_ref[i][j] != outputs[i][j]) {
err++;
#if VTA_DEBUG == 1
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
static_cast<int>(outputs_ref[i][j]),
static_cast<int>(outputs[i][j]));
#endif
}
}
}
// Free all allocated arrays
free2dArray<inp_T>(inputs, batch, in_feat);
free2dArray<wgt_T>(weights, out_feat, in_feat);
free2dArray<acc_T>(biases, batch, out_feat);
free2dArray<out_T>(outputs_ref, batch, out_feat);
free2dArray<out_T>(outputs, batch, out_feat);
freeBuffer(insn_buf);
freeBuffer(uop_buf);
freeBuffer(input_buf);
freeBuffer(weight_buf);
freeBuffer(bias_buf);
freeBuffer(output_buf);
if (err == 0) {
printf("INFO - Blocked GEMM test successful!\n");
return 0;
} else {
printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
return -1;
}
}
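// Single-shot GEMM test: the whole problem must fit in the on-chip buffers
// (see the asserts below), so one load/GEMM/store sequence suffices. An
// illustrative call with hypothetical sizes: gemm_test(4 * VTA_BATCH,
// 16 * VTA_BLOCK_IN, 16 * VTA_BLOCK_OUT, true);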
int gemm_test(int batch, int in_channels, int out_channels, bool uop_compression) {
// Some assertions
assert(batch % VTA_BATCH == 0);
assert(in_channels % VTA_BLOCK_IN == 0);
assert(out_channels % VTA_BLOCK_OUT == 0);
printf("=====================================================================================\n");
printf("INFO - Blocked GEMM test: batch=%d, in_channels=%d, out_channels=%d, uop_comp=%d\n",
batch, in_channels, out_channels, uop_compression);
// Derive number of elements that need to be loaded/stored
int ins_size = 7;
int uop_size = uop_compression ?
batch / VTA_BATCH :
batch / VTA_BATCH * in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
int inp_size = batch / VTA_BATCH * in_channels / VTA_BLOCK_IN;
int wgt_size = in_channels / VTA_BLOCK_IN * out_channels / VTA_BLOCK_OUT;
int out_size = batch / VTA_BATCH * out_channels / VTA_BLOCK_OUT;
// Make sure we don't exceed buffer bounds
assert(uop_size <= VTA_UOP_BUFF_DEPTH);
assert(inp_size <= VTA_INP_BUFF_DEPTH);
assert(wgt_size <= VTA_WGT_BUFF_DEPTH);
assert(out_size <= VTA_ACC_BUFF_DEPTH);
// Initialize instruction buffer
VTAGenericInsn *insn_buf =
static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
int insn_idx = 0;
// Load uops
insn_buf[insn_idx++] = get1DLoadStoreInsn(
VTA_OPCODE_LOAD,
VTA_MEM_ID_UOP,
0,
0,
uop_size,
0,
0,
0,
0);
// Load bias
insn_buf[insn_idx++] = get1DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_ACC, // type
0, // sram offset
0, // dram offset
out_size, // size
0, // pop prev dep
0, // pop next dep
1, // push prev dep
0); // push next dep
// Load weight block (pop next)
insn_buf[insn_idx++] = get1DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_WGT, // type
0, // sram offset
0, // dram offset
wgt_size, // size
0, // pop prev dep
1, // pop next dep
0, // push prev dep
0); // push next dep
// Load input block (push next)
insn_buf[insn_idx++] = get1DLoadStoreInsn(
VTA_OPCODE_LOAD, // opcode
VTA_MEM_ID_INP, // type
0, // sram offset
0, // dram offset
inp_size, // size
0, // pop prev dep
0, // pop next dep
0, // push prev dep
1); // push next dep
// Perform GEMM (pop prev, push prev if not last, push next if last)
insn_buf[insn_idx++] = getGEMMInsn(
0, // uop offset
batch / VTA_BATCH, // batch
in_channels / VTA_BLOCK_IN, // in_channels
out_channels / VTA_BLOCK_OUT, // out_channels
uop_compression, // uop_compression
1, // pop_prev_dep
0, // pop_next_dep
0, // push prev dep
1); // push_next_dep
// Store output block (pop prev, push prev if not last)
insn_buf[insn_idx++] = get1DLoadStoreInsn(
VTA_OPCODE_STORE, // opcode
VTA_MEM_ID_OUT, // type
0, // sram offset
0, // dram offset
out_size, // size
1, // pop prev dep
0, // pop next dep
1, // push prev dep
0); // push next dep
// Finish
insn_buf[insn_idx++] = getFinishInsn(0, 1);
// Prepare the uop buffer
VTAUop * uop_buf = getGEMMUops(
batch / VTA_BATCH,
in_channels / VTA_BLOCK_IN,
out_channels / VTA_BLOCK_OUT,
uop_compression,
0);
#if VTA_DEBUG == 1
printInstruction(ins_size, insn_buf);
printMicroOp(uop_size, uop_buf);
#endif
// Initialize inputs
inp_T **inputs = allocInit2dArray<inp_T>(batch, in_channels);
// Initialize weights
wgt_T **weights = allocInit2dArray<wgt_T>(out_channels, in_channels);
// Initialize biases
acc_T **biases = allocInit2dArray<acc_T>(batch, out_channels);
// Reference GEMM implementation
out_T **outputs_ref = alloc2dArray<out_T>(batch, out_channels);
for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_channels; j++) {
acc_T sum = biases[i][j];
for (int k = 0; k < in_channels; k++) {
sum += (acc_T) (inputs[i][k] * weights[j][k]);
}
// Set
outputs_ref[i][j] = (out_T) sum;
}
}
// Prepare the input buffer
uint32_t *input_buf = static_cast<uint32_t *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
packBuffer<uint32_t, 32, inp_T, VTA_INP_WIDTH>(input_buf,
inputs,
batch,
in_channels,
VTA_BATCH,
VTA_BLOCK_IN);
// Prepare the weight buffer
uint32_t *weight_buf = static_cast<uint32_t *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
packBuffer<uint32_t, 32, wgt_T, VTA_WGT_WIDTH>(weight_buf,
weights,
out_channels,
in_channels,
VTA_BLOCK_OUT,
VTA_BLOCK_IN);
// Prepare the bias buffer
uint32_t *bias_buf = static_cast<uint32_t *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
packBuffer<uint32_t, 32, acc_T, VTA_ACC_WIDTH>(bias_buf,
biases,
batch,
out_channels,
VTA_BATCH,
VTA_BLOCK_OUT);
// Prepare the output buffer
uint32_t *output_buf = static_cast<uint32_t *>(allocBuffer(VTA_OUT_ELEM_BYTES * out_size));
#ifdef NO_SIM
// Invoke the VTA
uint64_t t_fpga = vta(ins_size,
insn_buf,
uop_buf,
input_buf,
weight_buf,
bias_buf,
output_buf);
// Report timing
printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
printf("INFO - Throughput: %.3lfGOPs/s\n",
static_cast<float>(batch) * in_channels * out_channels * 2 / t_fpga);
#else
// Invoke the VTA
vta(ins_size,
(volatile insn_T *) insn_buf,
(volatile uop_T *) uop_buf,
(volatile bus_T *) input_buf,
(volatile bus_T *) weight_buf,
(volatile bus_T *) bias_buf,
(volatile bus_T *) output_buf);
#endif
// Unpack output data
out_T **outputs = alloc2dArray<out_T>(batch, out_channels);
unpackBuffer<out_T, VTA_OUT_WIDTH, uint32_t, 32>(outputs,
output_buf,
batch,
out_channels,
VTA_BATCH,
VTA_BLOCK_OUT);
// Correctness checks
int err = 0;
for (int i = 0; i < batch; i++) {
for (int j = 0; j < out_channels; j++) {
if (outputs_ref[i][j] != outputs[i][j]) {
err++;
#if VTA_DEBUG == 1
printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
static_cast<int>(outputs_ref[i][j]),
static_cast<int>(outputs[i][j]));
#endif
}
}
}
// Free all allocated arrays
free2dArray<inp_T>(inputs, batch, in_channels);
free2dArray<wgt_T>(weights, out_channels, in_channels);
free2dArray<acc_T>(biases, batch, out_channels);
free2dArray<out_T>(outputs_ref, batch, out_channels);
free2dArray<out_T>(outputs, batch, out_channels);
freeBuffer(insn_buf);
freeBuffer(uop_buf);
freeBuffer(input_buf);
freeBuffer(weight_buf);
freeBuffer(bias_buf);
freeBuffer(output_buf);
if (err == 0) {
printf("INFO - Blocked GEMM test successful!\n");
return 0;
} else {
printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
return -1;
}
}