blob: d4a2a2dd98f85280ffe133adbec877baeede9558 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*!
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
#ifndef VTA_VTA_H_
#define VTA_VTA_H_
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <assert.h>
#include <hls_stream.h>
#include <vta/hw_spec.h>
/*!
* Define HLS stream depth
*/
#define PRAGMA_SUB(x) _Pragma (#x)
#define PRAGMA_HLS(x) PRAGMA_SUB(x)
#define STREAM_IN_DEPTH 8
/* \typedef bus_T memory bus datatype*/
typedef ap_uint<VTA_BUS_WIDTH> bus_T;
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<VTA_UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<VTA_INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<VTA_WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<VTA_OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<VTA_ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<VTA_INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_imm_T ALU operation immediate datatype*/
typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_shr_arg_T ALU operation shift right immediate datatype*/
typedef ap_int<VTA_SHR_ARG_BIT_WIDTH> aluop_shr_arg_T;
/* \typedef aluop_mul_arg_T ALU operation multiply datatype*/
typedef ap_int<VTA_MUL_ARG_BIT_WIDTH> aluop_mul_arg_T;
/*!
* \brief Fetch module.
* Reads in \a insn_count instructions via DMA and pushes them to the
* appropriate load, gemm or store queue.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO.
*/
void fetch(
uint32_t insn_count,
volatile insn_T *insns,
hls::stream<insn_T> &load_queue,
hls::stream<insn_T> &gemm_queue,
hls::stream<insn_T> &store_queue);
/*!
* \brief Load module.
* Reads in load instructions from the load queue, and performs appropriate
* DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
* Updates dependence queues accordingly.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from GEMM to load stage.
* AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to GEMM stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/
void load(
volatile bus_T *inputs,
volatile bus_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO]);
/*!
* \brief Compute module.
* Reads in GEMM instructions from the gemm queue, and performs appropriate
* GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
* and writes computation results into the \a out_mem. Updates dependence
* queues accordingly.
* \param done Signal that indicates that VLA is done. AXI-lite memory mapped
* register.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to gemm stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from gemm to load stage.
* AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/
void compute(
volatile uint32_t &done,
volatile uop_T *uops,
volatile bus_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
bus_T inp_mem[VTA_INP_BUFF_DEPTH][INP_MAT_AXI_RATIO],
bus_T wgt_mem[VTA_WGT_BUFF_DEPTH][WGT_MAT_AXI_RATIO],
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief Store module.
* Reads in store instructions from the store queue, and performs appropriate
* store instructions from the output buffer in SRAM to DRAM. Updates dependence
* queues accordingly.
* \param outputs Output data base address in DRAM. AXI-4 master port.
* \param store_queue Store instruction queue. AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/
void store(
volatile bus_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
bus_T out_mem[VTA_ACC_BUFF_DEPTH][OUT_MAT_AXI_RATIO]);
/*!
* \brief VTA wrapper for simulation purpose only.
* Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port.
*/
void vta(
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile bus_T *inputs,
volatile bus_T *weights,
volatile bus_T *biases,
volatile bus_T *outputs);
#endif // VTA_VTA_H_