/* blob: 9751b2f137c82f21a60c4bad610ed8a2a84d6418 [file] [log] [blame] */
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file hw_spec.h
* \brief Preprocessor definitions for VTA HLS design and runtime.
*/
#ifndef VTA_HW_SPEC_H_
#define VTA_HW_SPEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
/*!
 * \brief Memory bus width, computed as 2^VTA_LOG_BUS_WIDTH.
 * \note VTA_LOG_BUS_WIDTH is not defined in this header; it is presumably
 *       supplied by the build configuration.
 */
#define VTA_BUS_WIDTH (1 << (VTA_LOG_BUS_WIDTH))
/*! \brief log2 of the instruction data type width. */
#define VTA_LOG_INS_WIDTH 7
/*! \brief Instruction data type width: 2^VTA_LOG_INS_WIDTH. */
#define VTA_INS_WIDTH (1 << (VTA_LOG_INS_WIDTH))
/*! \brief log2 of the micro-op data type width. */
#define VTA_LOG_UOP_WIDTH 5
/*! \brief Micro-op data type width: 2^VTA_LOG_UOP_WIDTH. */
#define VTA_UOP_WIDTH (1 << (VTA_LOG_UOP_WIDTH))
/*! \brief Weight data type width: 2^VTA_LOG_WGT_WIDTH (log2 value not defined in this header). */
#define VTA_WGT_WIDTH (1 << (VTA_LOG_WGT_WIDTH))
/*! \brief Input data type width: 2^VTA_LOG_INP_WIDTH (log2 value not defined in this header). */
#define VTA_INP_WIDTH (1 << (VTA_LOG_INP_WIDTH))
/*! \brief Output data type width: 2^VTA_LOG_OUT_WIDTH (log2 value not defined in this header). */
#define VTA_OUT_WIDTH (1 << (VTA_LOG_OUT_WIDTH))
/*! \brief Accumulator data type width: 2^VTA_LOG_ACC_WIDTH (log2 value not defined in this header). */
#define VTA_ACC_WIDTH (1 << (VTA_LOG_ACC_WIDTH))
/*!
 * \brief Batch size: 2^VTA_LOG_BATCH.
 * Corresponds to A in an (A,B)x(B,C) matrix multiplication.
 */
#define VTA_BATCH (1 << (VTA_LOG_BATCH))
/*!
 * \brief Blocking factor of the innermost loop: 2^VTA_LOG_BLOCK_IN.
 * Corresponds to B in an (A,B)x(B,C) matrix multiplication.
 */
#define VTA_BLOCK_IN (1 << (VTA_LOG_BLOCK_IN))
/*!
 * \brief Blocking factor of the outer loop: 2^VTA_LOG_BLOCK_OUT.
 * Corresponds to C in an (A,B)x(B,C) matrix multiplication.
 */
#define VTA_BLOCK_OUT (1 << (VTA_LOG_BLOCK_OUT))
/*! \brief On-chip micro-op buffer size in bytes: 2^VTA_LOG_UOP_BUFF_SIZE. */
#define VTA_UOP_BUFF_SIZE (1 << (VTA_LOG_UOP_BUFF_SIZE))
/*! \brief On-chip weight buffer size in bytes: 2^VTA_LOG_WGT_BUFF_SIZE. */
#define VTA_WGT_BUFF_SIZE (1 << (VTA_LOG_WGT_BUFF_SIZE))
/*! \brief On-chip activation buffer size in bytes: 2^VTA_LOG_INP_BUFF_SIZE. */
#define VTA_INP_BUFF_SIZE (1 << (VTA_LOG_INP_BUFF_SIZE))
/*! \brief On-chip accumulator buffer size in bytes: 2^VTA_LOG_ACC_BUFF_SIZE. */
#define VTA_ACC_BUFF_SIZE (1 << (VTA_LOG_ACC_BUFF_SIZE))
/*! \brief Input matrix size in bits: element width x batch x inner block. */
#define VTA_INP_MATRIX_WIDTH ((VTA_INP_WIDTH) * (VTA_BATCH) * (VTA_BLOCK_IN))
/*! \brief Weight matrix size in bits: element width x outer block x inner block. */
#define VTA_WGT_MATRIX_WIDTH ((VTA_WGT_WIDTH) * (VTA_BLOCK_OUT) * (VTA_BLOCK_IN))
/*! \brief Accumulator matrix size in bits: element width x batch x outer block. */
#define VTA_ACC_MATRIX_WIDTH ((VTA_ACC_WIDTH) * (VTA_BATCH) * (VTA_BLOCK_OUT))
/*! \brief Output matrix size in bits: element width x batch x outer block. */
#define VTA_OUT_MATRIX_WIDTH ((VTA_OUT_WIDTH) * (VTA_BATCH) * (VTA_BLOCK_OUT))
/*! \brief Input matrix size divided by the bus (AXI) width; integer division. */
#define INP_MAT_AXI_RATIO ((VTA_INP_MATRIX_WIDTH) / (VTA_BUS_WIDTH))
/*! \brief Weight matrix size divided by the bus (AXI) width; integer division. */
#define WGT_MAT_AXI_RATIO ((VTA_WGT_MATRIX_WIDTH) / (VTA_BUS_WIDTH))
/*! \brief Accumulator matrix size divided by the bus (AXI) width; integer division. */
#define ACC_MAT_AXI_RATIO ((VTA_ACC_MATRIX_WIDTH) / (VTA_BUS_WIDTH))
/*! \brief Output matrix size divided by the bus (AXI) width; integer division. */
#define OUT_MAT_AXI_RATIO ((VTA_OUT_MATRIX_WIDTH) / (VTA_BUS_WIDTH))
/*! \brief Size of one instruction buffer element in bytes (width in bits / 8). */
#define VTA_INS_ELEM_BYTES ((VTA_INS_WIDTH) / 8)
/*! \brief Size of one micro-op buffer element in bytes (width in bits / 8). */
#define VTA_UOP_ELEM_BYTES ((VTA_UOP_WIDTH) / 8)
/*! \brief Size of one activation buffer element in bytes (matrix width in bits / 8). */
#define VTA_INP_ELEM_BYTES ((VTA_INP_MATRIX_WIDTH) / 8)
/*! \brief Size of one weight buffer element in bytes (matrix width in bits / 8). */
#define VTA_WGT_ELEM_BYTES ((VTA_WGT_MATRIX_WIDTH) / 8)
/*! \brief Size of one accumulator buffer element in bytes (matrix width in bits / 8). */
#define VTA_ACC_ELEM_BYTES ((VTA_ACC_MATRIX_WIDTH) / 8)
/*! \brief Size of one output buffer element in bytes (matrix width in bits / 8). */
#define VTA_OUT_ELEM_BYTES ((VTA_OUT_MATRIX_WIDTH) / 8)
/*! \brief On-chip micro-op buffer depth: buffer size in bytes / element size in bytes. */
#define VTA_UOP_BUFF_DEPTH ((VTA_UOP_BUFF_SIZE) / (VTA_UOP_ELEM_BYTES))
/*!
 * \brief log2 of the on-chip micro-op buffer depth.
 * The "+ 3" accounts for the element width being expressed in bits while the
 * buffer size is expressed in bytes (log2(8) == 3).
 */
#define VTA_LOG_UOP_BUFF_DEPTH ((VTA_LOG_UOP_BUFF_SIZE) - (VTA_LOG_UOP_WIDTH) + 3)
/*! \brief On-chip weight buffer depth: buffer size in bytes / element size in bytes. */
#define VTA_WGT_BUFF_DEPTH ((VTA_WGT_BUFF_SIZE) / (VTA_WGT_ELEM_BYTES))
/*!
 * \brief log2 of the on-chip weight buffer depth.
 * Subtracts the log2 of every factor of the weight matrix width
 * (outer block, inner block, element width); "+ 3" converts bits to bytes.
 */
#define VTA_LOG_WGT_BUFF_DEPTH                                        \
  ((VTA_LOG_WGT_BUFF_SIZE) - (VTA_LOG_BLOCK_OUT) - (VTA_LOG_BLOCK_IN) \
   - (VTA_LOG_WGT_WIDTH) + 3)
/*! \brief On-chip activation buffer depth: buffer size in bytes / element size in bytes. */
#define VTA_INP_BUFF_DEPTH ((VTA_INP_BUFF_SIZE) / (VTA_INP_ELEM_BYTES))
/*!
 * \brief log2 of the on-chip activation buffer depth.
 * Subtracts the log2 of every factor of the input matrix width
 * (batch, inner block, element width); "+ 3" converts bits to bytes.
 */
#define VTA_LOG_INP_BUFF_DEPTH                                    \
  ((VTA_LOG_INP_BUFF_SIZE) - (VTA_LOG_BATCH) - (VTA_LOG_BLOCK_IN) \
   - (VTA_LOG_INP_WIDTH) + 3)
/*! \brief On-chip accumulator buffer depth: buffer size in bytes / element size in bytes. */
#define VTA_ACC_BUFF_DEPTH ((VTA_ACC_BUFF_SIZE) / (VTA_ACC_ELEM_BYTES))
/*!
 * \brief log2 of the on-chip accumulator buffer depth.
 * Subtracts the log2 of every factor of the accumulator matrix width
 * (batch, outer block, element width); "+ 3" converts bits to bytes.
 */
#define VTA_LOG_ACC_BUFF_DEPTH                                     \
  ((VTA_LOG_ACC_BUFF_SIZE) - (VTA_LOG_BATCH) - (VTA_LOG_BLOCK_OUT) \
   - (VTA_LOG_ACC_WIDTH) + 3)
/*! Instruction opcode field bitwidth (the opcode encodings below, 0..4, fit in 3 bits) */
#define VTA_OPCODE_BIT_WIDTH 3
/*! ALU opcode field bitwidth (the ALU opcode encodings below, 0..3, fit in 2 bits) */
#define VTA_ALU_OPCODE_BIT_WIDTH 2
/*! Opcode: load encoding */
#define VTA_OPCODE_LOAD 0
/*! Opcode: store encoding */
#define VTA_OPCODE_STORE 1
/*! Opcode: GEMM encoding */
#define VTA_OPCODE_GEMM 2
/*! Opcode: finish encoding */
#define VTA_OPCODE_FINISH 3
/*! Opcode: ALU encoding */
#define VTA_OPCODE_ALU 4
/*! ALU opcode: min op */
#define VTA_ALU_OPCODE_MIN 0
/*! ALU opcode: max op */
#define VTA_ALU_OPCODE_MAX 1
/*! ALU opcode: add op */
#define VTA_ALU_OPCODE_ADD 2
/*! ALU opcode: shift right by immediate op */
#define VTA_ALU_OPCODE_SHR 3
/*! Memory type field bitwidth (width of VTAMemInsn::memory_type) */
#define VTA_MEMOP_ID_BIT_WIDTH 2
/*! Load/Store Instruction: SRAM address width (width of VTAMemInsn::sram_base) */
#define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
/*! Load/Store Instruction: DRAM address width (width of VTAMemInsn::dram_base) */
#define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
/*! Load/Store Instruction: transfer size width (width of VTAMemInsn::y_size and x_size) */
#define VTA_MEMOP_SIZE_BIT_WIDTH 16
/*! Load/Store Instruction: stride size width (width of VTAMemInsn::x_stride) */
#define VTA_MEMOP_STRIDE_BIT_WIDTH 16
/*! Load/Store Instruction: padding width (width of each VTAMemInsn pad field) */
#define VTA_MEMOP_PAD_BIT_WIDTH 4
/*! Load/Store Instruction: padding value encoding width (not referenced elsewhere in this header) */
#define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
/*! GEMM/ALU Instruction: loop max iter bits (width of iter_out and iter_in) */
#define VTA_LOOP_ITER_WIDTH 14
/*! ALU Instruction: immediate bitwidth (width of VTAAluInsn::imm) */
#define VTA_ALUOP_IMM_BIT_WIDTH 16
/*! ALU Instruction: shift arg bitwidth (equal to log2 of the accumulator width) */
#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
/*! ALU Instruction: multiply arg bitwidth (not referenced elsewhere in this header) */
#define VTA_MUL_ARG_BIT_WIDTH 8
/*! Mem ID constant: uop memory */
#define VTA_MEM_ID_UOP 0
/*! Mem ID constant: weight memory */
#define VTA_MEM_ID_WGT 1
/*! Mem ID constant: input memory */
#define VTA_MEM_ID_INP 2
/*! Mem ID constant: accumulator/bias memory */
#define VTA_MEM_ID_ACC 3
/*!
 * Mem ID constant: output store buffer
 * NOTE(review): the value 4 needs 3 bits, but VTA_MEMOP_ID_BIT_WIDTH is 2, so
 * storing it in the 2-bit VTAMemInsn::memory_type field would truncate it to 0
 * (the same value as VTA_MEM_ID_UOP). Confirm how store instructions are
 * expected to encode their memory type.
 */
#define VTA_MEM_ID_OUT 4
/*!
 * \brief GEMM micro-op: first bit of the accumulator index field.
 * The GEMM fields below are laid out back to back in the same order and with
 * the same widths as dst_idx, src_idx and wgt_idx in VTAUop.
 */
#define VTA_UOP_GEM_0_0 0
/*! \brief GEMM micro-op: last bit of the accumulator index field. */
#define VTA_UOP_GEM_0_1 ((VTA_UOP_GEM_0_0) + (VTA_LOG_ACC_BUFF_DEPTH) - 1)
/*! \brief GEMM micro-op: first bit of the input index field. */
#define VTA_UOP_GEM_1_0 ((VTA_UOP_GEM_0_1) + 1)
/*! \brief GEMM micro-op: last bit of the input index field. */
#define VTA_UOP_GEM_1_1 ((VTA_UOP_GEM_1_0) + (VTA_LOG_INP_BUFF_DEPTH) - 1)
/*! \brief GEMM micro-op: first bit of the weight index field. */
#define VTA_UOP_GEM_2_0 ((VTA_UOP_GEM_1_1) + 1)
/*! \brief GEMM micro-op: last bit of the weight index field. */
#define VTA_UOP_GEM_2_1 ((VTA_UOP_GEM_2_0) + (VTA_LOG_WGT_BUFF_DEPTH) - 1)
/*! \brief ALU micro-op: first bit of the destination index field. */
#define VTA_UOP_ALU_0_0 0
/*! \brief ALU micro-op: last bit of the destination index field (VTA_LOG_ACC_BUFF_DEPTH bits wide). */
#define VTA_UOP_ALU_0_1 ((VTA_UOP_ALU_0_0) + (VTA_LOG_ACC_BUFF_DEPTH) - 1)
/*! \brief ALU micro-op: first bit of the source index field. */
#define VTA_UOP_ALU_1_0 ((VTA_UOP_ALU_0_1) + 1)
/*! \brief ALU micro-op: last bit of the source index field (VTA_LOG_INP_BUFF_DEPTH bits wide). */
#define VTA_UOP_ALU_1_1 ((VTA_UOP_ALU_1_0) + (VTA_LOG_INP_BUFF_DEPTH) - 1)
/*! \brief VTA generic instruction
 * Exposes only the leading fields that every instruction struct in this
 * header starts with (the opcode followed by the four dependence flags) and
 * pads the rest. pad_0 completes the first 64-bit unit and pad_1 fills a
 * second one, for a total of 128 bits, which equals VTA_INS_WIDTH (1 << 7).
 */
typedef struct {
/*! \brief The instruction opcode */
uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief pop_prev_dep flag; its meaning is documented on the concrete instruction structs */
uint64_t pop_prev_dep : 1;
/*! \brief pop_next_dep flag; its meaning is documented on the concrete instruction structs */
uint64_t pop_next_dep : 1;
/*! \brief push_prev_dep flag; its meaning is documented on the concrete instruction structs */
uint64_t push_prev_dep : 1;
/*! \brief push_next_dep flag; its meaning is documented on the concrete instruction structs */
uint64_t push_next_dep : 1;
/*! \brief Padding: remaining bits of the first 64-bit unit (64 minus opcode width minus 4 flag bits) */
uint64_t pad_0 : 64 - VTA_OPCODE_BIT_WIDTH - 4;
/*! \brief Padding: the entire second 64-bit unit */
uint64_t pad_1 : 64;
} VTAGenericInsn;
/*! \brief VTA load/store instruction
 * Load/store instruction can describe a 2D strided access pattern
 * with padding, which can be useful to perform spatial padding
 * on the fly on a tensor on which to perform 2D convolution.
 * For instance if we try to load a 4x4 spatial tile from a 16x16
 * matrix with padding of size 1 on all dimensions:
 * y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
 * x_pad_0 = 1, x_pad_1 = 1.
 *
 * Layout: with the field widths defined in this header, the fields from
 * opcode through dram_base occupy 3 + 4 + 2 + 16 + 32 = 57 bits and the
 * fields from y_size through x_pad_1 occupy 3 * 16 + 4 * 4 = 64 bits.
 * The 16-bit y_size therefore does not fit in the 7 bits left in the first
 * 64-bit unit. Whether a bit-field that does not fit starts a new unit or
 * straddles two units is implementation-defined in C, so an explicit
 * zero-width bit-field is used to start y_size on a fresh 64-bit unit
 * (presumably the layout the instruction consumer expects, since it is what
 * compilers that never straddle units already produce).
 */
typedef struct {
  /*! \brief The instruction opcode */
  uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
  /*! \brief Unused in this instruction */
  uint64_t pop_prev_dep : 1;
  /*! \brief Pop dependence token from GEMM stage */
  uint64_t pop_next_dep : 1;
  /*! \brief Unused in this instruction */
  uint64_t push_prev_dep : 1;
  /*! \brief Push dependence token to GEMM stage */
  uint64_t push_next_dep : 1;
  /*! \brief Source/destination SRAM for store/load instruction */
  uint64_t memory_type : VTA_MEMOP_ID_BIT_WIDTH;
  /*! \brief SRAM base address (pointer to memory elem type) */
  uint64_t sram_base : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
  /*! \brief DRAM base address (pointer to memory elem type) */
  uint64_t dram_base : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
  /* Zero-width unnamed bit-field: no further bit-field is packed into the
   * current unit, so y_size begins the second 64-bit unit on every compiler. */
  uint64_t : 0;
  /*! \brief 2D access pattern: y-size */
  uint64_t y_size : VTA_MEMOP_SIZE_BIT_WIDTH;
  /*! \brief 2D access pattern: x-size (in terms of memory elements) */
  uint64_t x_size : VTA_MEMOP_SIZE_BIT_WIDTH;
  /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
  uint64_t x_stride : VTA_MEMOP_STRIDE_BIT_WIDTH;
  /*! \brief 2D access pattern: start padding along y dimension */
  uint64_t y_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
  /*! \brief 2D access pattern: end padding along y dimension */
  uint64_t y_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
  /*! \brief 2D access pattern: start padding along x dimension */
  uint64_t x_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
  /*! \brief 2D access pattern: end padding along x dimension */
  uint64_t x_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
} VTAMemInsn;
/*! \brief VTA GEMM instruction
 * GEMM instruction is implemented by executing a sequence of micro-operations
 * that is read in the local micro-op memory, delimited by \a uop_bgn and
 * \a uop_end. For improved storage-efficiency, the micro-operations can be
 * executed in a 2-level nested loop as follows:
 * \code{.cpp}
 *   for (i = 0; i < iter_out; i++) {
 *     for (j = 0; j < iter_in; j++) {
 *       for (k = uop_bgn; k < uop_end; k++) {
 *         // Read micro op
 *         uop_T uop = uop_mem[k];
 *         // Read in memory indices (field names as declared in VTAUop)
 *         acc_idx_T acc_idx = uop.dst_idx;
 *         inp_idx_T inp_idx = uop.src_idx;
 *         wgt_idx_T wgt_idx = uop.wgt_idx;
 *         // Update those indices with the following affine functions
 *         acc_idx += j * dst_factor_in + i * dst_factor_out;
 *         inp_idx += j * src_factor_in + i * src_factor_out;
 *         wgt_idx += j * wgt_factor_in + i * wgt_factor_out;
 *         // Perform GEMM operation
 *         acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt_mem[wgt_idx]);
 *       }
 *     }
 *   }
 * \endcode
 *
 * NOTE(review): the affine offsets are written in terms of the loop counters
 * i and j, on the assumption that each "loop index factor" scales the counter
 * of its loop rather than the loop bound; confirm against the hardware
 * implementation.
 *
 * The widths of most fields depend on configuration macros, so the position
 * of each field within the instruction is configuration-dependent.
 */
typedef struct {
/*! \brief The instruction opcode */
uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Pop dependence token from load stage */
uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from store stage */
uint64_t pop_next_dep : 1;
/*! \brief Push dependence token to load stage */
uint64_t push_prev_dep : 1;
/*! \brief Push dependence token to store stage */
uint64_t push_next_dep : 1;
/*! \brief Reset register */
uint64_t reset_reg : 1;
/*! \brief Micro-op begin address (first micro-op executed, see loop above) */
uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
/*! \brief Micro-op end address (exclusive bound in the loop above); one bit
 * wider than uop_bgn, presumably so it can hold the full buffer depth */
uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH + 1;
/*! \brief Iterations in the outer uop execution loop */
uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
/*! \brief Iterations in the inner uop execution loop */
uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
/*! \brief Outer loop accumulator memory index factor */
uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory index factor */
uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Outer loop input memory index factor */
uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
/*! \brief Inner loop input memory index factor */
uint64_t src_factor_in : VTA_LOG_INP_BUFF_DEPTH;
/*! \brief Outer loop weight memory index factor */
uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
/*! \brief Inner loop weight memory index factor */
uint64_t wgt_factor_in : VTA_LOG_WGT_BUFF_DEPTH;
} VTAGemInsn;
/*! \brief VTA ALU instruction
 * ALU instruction is implemented by executing a sequence of micro-operations
 * that is read in the local micro-op memory, delimited by \a uop_bgn and
 * \a uop_end. For improved storage-efficiency, the micro-operations can be
 * executed in a 2-level nested loop as follows:
 * \code{.cpp}
 *   for (i = 0; i < iter_out; i++) {
 *     for (j = 0; j < iter_in; j++) {
 *       for (k = uop_bgn; k < uop_end; k++) {
 *         // Read micro op
 *         uop_T uop = uop_mem[k];
 *         // Read in memory indices (field names as declared in VTAUop)
 *         acc_idx_T dst_idx = uop.dst_idx;
 *         inp_idx_T src_idx = uop.src_idx;
 *         // Update those indices with the following affine functions
 *         dst_idx += j * dst_factor_in + i * dst_factor_out;
 *         src_idx += j * src_factor_in + i * src_factor_out;
 *         // Perform ALU operation
 *         if (use_imm) {
 *           acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
 *         } else {
 *           acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
 *         }
 *       }
 *     }
 *   }
 * \endcode
 *
 * NOTE(review): the affine offsets are written in terms of the loop counters
 * i and j, on the assumption that each "index factor" scales the counter of
 * its loop rather than the loop bound; confirm against the hardware
 * implementation.
 */
typedef struct {
/*! \brief The instruction opcode */
uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
/*! \brief Pop dependence token from load stage */
uint64_t pop_prev_dep : 1;
/*! \brief Pop dependence token from store stage */
uint64_t pop_next_dep : 1;
/*! \brief Push dependence token to load stage */
uint64_t push_prev_dep : 1;
/*! \brief Push dependence token to store stage */
uint64_t push_next_dep : 1;
/*! \brief Reset register */
uint64_t reset_reg : 1;
/*! \brief Micro-op begin address (first micro-op executed, see loop above) */
uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
/*! \brief Micro-op end address (exclusive bound in the loop above); one bit
 * wider than uop_bgn, presumably so it can hold the full buffer depth */
uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH + 1;
/*! \brief Iterations in the outer uop execution loop */
uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
/*! \brief Iterations in the inner uop execution loop */
uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
/*! \brief Outer loop accumulator memory destination index factor */
uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory destination index factor */
uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Outer loop accumulator memory source index factor */
uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
/*! \brief Inner loop accumulator memory source index factor */
uint64_t src_factor_in : VTA_LOG_INP_BUFF_DEPTH;
/*! \brief ALU opcode (one of the VTA_ALU_OPCODE_* encodings) */
uint64_t alu_opcode : VTA_ALU_OPCODE_BIT_WIDTH;
/*! \brief When set, the second ALU operand is \a imm instead of acc_mem[src_idx] */
uint64_t use_imm : 1;
/*! \brief Immediate value: declared on a signed type to allow negative values */
int64_t imm : VTA_ALUOP_IMM_BIT_WIDTH;
} VTAAluInsn;
/*! \brief VTA instruction converter
 * Overlays the generic, load/store, GEMM and ALU views on the same storage so
 * that one instruction can be accessed through any of the four layouts.
 */
union VTAInsn {
/*! \brief VTA generic instruction */
VTAGenericInsn generic;
/*! \brief VTA load/store instruction */
VTAMemInsn mem;
/*! \brief VTA GEMM instruction */
VTAGemInsn gemm;
/*! \brief VTA ALU instruction */
VTAAluInsn alu;
};
/*! \brief VTA micro-op for GEMM/ALU instruction
 * The three fields are declared in the same order and with the same widths
 * as the bit ranges described by the VTA_UOP_GEM_* macros.
 */
typedef struct {
/*! \brief Destination index (indexes accum buffer) */
uint32_t dst_idx : VTA_LOG_ACC_BUFF_DEPTH;
/*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
uint32_t src_idx : VTA_LOG_INP_BUFF_DEPTH;
/*! \brief Weight index (indexes weight buffer) */
uint32_t wgt_idx : VTA_LOG_WGT_BUFF_DEPTH;
} VTAUop;
#ifdef __cplusplus
}
#endif
#endif // VTA_HW_SPEC_H_