// include/vta/hw_spec.h - tvm-vta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 /*!
  * \file hw_spec.h
  * \brief Preprocessor definitions for VTA HLS design and runtime.
  */

 #ifndef VTA_HW_SPEC_H_
 #define VTA_HW_SPEC_H_

 /*
  * <stdint.h> is included before the linkage block: a C++ translation unit
  * must include standard headers outside of any declaration, and an
  * extern "C" { ... } block is a declaration.
  */
 #include <stdint.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Data-type widths, all expressed in bits. Widths written as
  * (1 << VTA_LOG_*) rely on exponents that are not defined in this header
  * (VTA_LOG_BUS_WIDTH, VTA_LOG_WGT_WIDTH, VTA_LOG_INP_WIDTH, VTA_LOG_OUT_WIDTH,
  * VTA_LOG_ACC_WIDTH); presumably they are supplied by the build
  * configuration -- TODO confirm.
  */

 /*! Memory bus width in bits */
 #define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH)

 /*! log2 of instruction data type width */
 #define VTA_LOG_INS_WIDTH 7
 /*! Instruction data type width in bits (1 << 7 = 128) */
 #define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
 /*! log2 of micro-op data type width */
 #define VTA_LOG_UOP_WIDTH 5
 /*! Micro-op data type width in bits (1 << 5 = 32) */
 #define VTA_UOP_WIDTH (1 << VTA_LOG_UOP_WIDTH)
 /*! Weight data type width in bits */
 #define VTA_WGT_WIDTH (1 << VTA_LOG_WGT_WIDTH)
 /*! Input data type width in bits */
 #define VTA_INP_WIDTH (1 << VTA_LOG_INP_WIDTH)
 /*! Output data type width in bits */
 #define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
 /*! Accumulator data type width in bits */
 #define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)

 /*! Batch size (corresponds to A in an (A,B)x(B,C) matrix multiply) */
 #define VTA_BATCH (1 << VTA_LOG_BATCH)
 /*! Blocking factor of the innermost loop (corresponds to B in an (A,B)x(B,C) matrix multiply) */
 #define VTA_BLOCK_IN (1 << VTA_LOG_BLOCK_IN)
 /*! Blocking factor of the outer loop (corresponds to C in an (A,B)x(B,C) matrix multiply) */
 #define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)

 /*! On-chip micro-op buffer size in bytes */
 #define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
 /*! On-chip weight buffer size in bytes */
 #define VTA_WGT_BUFF_SIZE (1 << VTA_LOG_WGT_BUFF_SIZE)
 /*! On-chip activation (input) buffer size in bytes */
 #define VTA_INP_BUFF_SIZE (1 << VTA_LOG_INP_BUFF_SIZE)
 /*! On-chip accumulator buffer size in bytes */
 #define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)

 /*! Input matrix (VTA_BATCH x VTA_BLOCK_IN elements) size in bits */
 #define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN)
 /*! Weight matrix (VTA_BLOCK_OUT x VTA_BLOCK_IN elements) size in bits */
 #define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN)
 /*! Accumulator matrix (VTA_BATCH x VTA_BLOCK_OUT elements) size in bits */
 #define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
 /*! Output matrix (VTA_BATCH x VTA_BLOCK_OUT elements) size in bits */
 #define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)

 /*
  * The four ratios below use integer division, so each evaluates to 0 when
  * the matrix is narrower than the memory bus.
  */
 /*! Ratio between input matrix size and AXI (memory bus) width */
 #define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH)
 /*! Ratio between weight matrix size and AXI (memory bus) width */
 #define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH)
 /*! Ratio between accumulator matrix size and AXI (memory bus) width */
 #define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH)
 /*! Ratio between output matrix size and AXI (memory bus) width */
 #define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH)

 /*! Size of instruction buffer element in bytes */
 #define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
 /*! Size of uop buffer element in bytes */
 #define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
 /*! Size of activation (input) buffer element in bytes: one input matrix */
 #define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8)
 /*! Size of weight buffer element in bytes: one weight matrix */
 #define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8)
 /*! Size of accumulator buffer element in bytes: one accumulator matrix */
 #define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8)
 /*! Size of output buffer element in bytes: one output matrix */
 #define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8)

 /*
  * Buffer depth = buffer size in bytes / element size in bytes.
  * In each log2 form the trailing "+ 3" is log2(8): it converts the element
  * width from bits to bytes before it is taken off the log2 byte size.
  */

 /*! On-chip micro-op buffer depth (number of micro-ops) */
 #define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
 /*! log2 of on-chip micro-op buffer depth */
 #define VTA_LOG_UOP_BUFF_DEPTH (VTA_LOG_UOP_BUFF_SIZE - VTA_LOG_UOP_WIDTH + 3)
 /*! On-chip weight buffer depth (number of weight matrices) */
 #define VTA_WGT_BUFF_DEPTH (VTA_WGT_BUFF_SIZE / VTA_WGT_ELEM_BYTES)
 /*! log2 of on-chip weight buffer depth */
 #define VTA_LOG_WGT_BUFF_DEPTH \
     (VTA_LOG_WGT_BUFF_SIZE - VTA_LOG_BLOCK_OUT - VTA_LOG_BLOCK_IN - VTA_LOG_WGT_WIDTH + 3)
 /*! On-chip activation (input) buffer depth (number of input matrices) */
 #define VTA_INP_BUFF_DEPTH (VTA_INP_BUFF_SIZE / VTA_INP_ELEM_BYTES)
 /*! log2 of on-chip activation (input) buffer depth */
 #define VTA_LOG_INP_BUFF_DEPTH \
     (VTA_LOG_INP_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_IN - VTA_LOG_INP_WIDTH + 3)
 /*! On-chip accumulator buffer depth (number of accumulator matrices) */
 #define VTA_ACC_BUFF_DEPTH (VTA_ACC_BUFF_SIZE / VTA_ACC_ELEM_BYTES)
 /*! log2 of on-chip accumulator buffer depth */
 #define VTA_LOG_ACC_BUFF_DEPTH \
     (VTA_LOG_ACC_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_OUT - VTA_LOG_ACC_WIDTH + 3)

 /*! Instruction opcode field bitwidth (wide enough for opcodes 0..4 below) */
 #define VTA_OPCODE_BIT_WIDTH 3
 /*! ALU opcode field bitwidth (wide enough for ALU opcodes 0..3 below) */
 #define VTA_ALU_OPCODE_BIT_WIDTH 2

 /*! Opcode: load encoding */
 #define VTA_OPCODE_LOAD 0
 /*! Opcode: store encoding */
 #define VTA_OPCODE_STORE 1
 /*! Opcode: GEMM encoding */
 #define VTA_OPCODE_GEMM 2
 /*! Opcode: finish encoding */
 #define VTA_OPCODE_FINISH 3
 /*! Opcode: ALU encoding */
 #define VTA_OPCODE_ALU 4

 /*
  * Every ALU op takes two operands: the destination accumulator entry and
  * either the immediate or a source accumulator entry (see VTAAluInsn).
  */
 /*! ALU opcode: binary min op */
 #define VTA_ALU_OPCODE_MIN 0
 /*! ALU opcode: binary max op */
 #define VTA_ALU_OPCODE_MAX 1
 /*! ALU opcode: binary add op */
 #define VTA_ALU_OPCODE_ADD 2
 /*! ALU opcode: shift right by immediate op */
 #define VTA_ALU_OPCODE_SHR 3

 /*! Memory type field bitwidth */
 #define VTA_MEMOP_ID_BIT_WIDTH 2
 /*! Load/Store Instruction: SRAM address width */
 #define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
 /*! Load/Store Instruction: DRAM address width */
 #define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
 /*! Load/Store Instruction: transfer size width */
 #define VTA_MEMOP_SIZE_BIT_WIDTH 16
 /*! Load/Store Instruction: stride size width */
 #define VTA_MEMOP_STRIDE_BIT_WIDTH 16
 /*! Load/Store Instruction: padding width */
 #define VTA_MEMOP_PAD_BIT_WIDTH 4
 /*! Load/Store Instruction: padding value encoding width */
 #define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
 /*! GEMM/ALU Instruction: loop max iter bits */
 #define VTA_LOOP_ITER_WIDTH 14
 /*! ALU Instruction: immediate bitwidth */
 #define VTA_ALUOP_IMM_BIT_WIDTH 16
 /*! ALU Instruction: shift arg bitwidth */
 #define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
 /*! ALU Instruction: multiply arg bitwidth */
 #define VTA_MUL_ARG_BIT_WIDTH 8

 /*! Mem ID constant: uop memory */
 #define VTA_MEM_ID_UOP 0
 /*! Mem ID constant: weight memory */
 #define VTA_MEM_ID_WGT 1
 /*! Mem ID constant: input memory */
 #define VTA_MEM_ID_INP 2
 /*! Mem ID constant: accumulator/bias memory */
 #define VTA_MEM_ID_ACC 3
 /*!
  * Mem ID constant: output store buffer
  * NOTE(review): the value 4 needs 3 bits, but VTA_MEMOP_ID_BIT_WIDTH is 2, so
  * it cannot be stored in the memory_type bit-field of VTAMemInsn as declared.
  * Confirm how this ID is meant to be used before relying on it.
  */
 #define VTA_MEM_ID_OUT 4

 /*
  * Bit positions of the micro-op fields. Each field has a start (_0) and an
  * inclusive end (_1) position; consecutive fields are packed back to back.
  */

 /*! GEMM Micro-op start position of the acc_idx field */
 #define VTA_UOP_GEM_0_0 0
 /*! GEMM Micro-op end position of the acc_idx field */
 #define VTA_UOP_GEM_0_1 (VTA_UOP_GEM_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM Micro-op start position of the inp_idx field */
 #define VTA_UOP_GEM_1_0 (VTA_UOP_GEM_0_1 + 1)
 /*! GEMM Micro-op end position of the inp_idx field */
 #define VTA_UOP_GEM_1_1 (VTA_UOP_GEM_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
 /*! GEMM Micro-op start position of the wgt_idx field */
 #define VTA_UOP_GEM_2_0 (VTA_UOP_GEM_1_1 + 1)
 /*! GEMM Micro-op end position of the wgt_idx field */
 #define VTA_UOP_GEM_2_1 (VTA_UOP_GEM_2_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)

 /*! ALU Micro-op start position of the dst_idx field */
 #define VTA_UOP_ALU_0_0 0
 /*! ALU Micro-op end position of the dst_idx field */
 #define VTA_UOP_ALU_0_1 (VTA_UOP_ALU_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! ALU Micro-op start position of the src_idx field */
 #define VTA_UOP_ALU_1_0 (VTA_UOP_ALU_0_1 + 1)
 /*! ALU Micro-op end position of the src_idx field */
 #define VTA_UOP_ALU_1_1 (VTA_UOP_ALU_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)

 /*! \brief VTA generic instruction
  *   Declares only the opcode and the four dependence flags, i.e. the leading
  *   fields that VTAMemInsn, VTAGemInsn and VTAAluInsn also declare first and
  *   in the same order. The remaining bits are padding; the declared fields
  *   add up to 2 x 64 bits.
  */
 typedef struct {
   /*! \brief The instruction opcode (see the VTA_OPCODE_* constants) */
   uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief pop_prev_dep flag; its meaning is documented per instruction type */
   uint64_t pop_prev_dep   : 1;
   /*! \brief pop_next_dep flag; its meaning is documented per instruction type */
   uint64_t pop_next_dep   : 1;
   /*! \brief push_prev_dep flag; its meaning is documented per instruction type */
   uint64_t push_prev_dep  : 1;
   /*! \brief push_next_dep flag; its meaning is documented per instruction type */
   uint64_t push_next_dep  : 1;
   /*! \brief Padding: 64 bits minus the opcode and the 4 flag bits */
   uint64_t pad_0          : 64 - VTA_OPCODE_BIT_WIDTH - 4;
   /*! \brief Padding: a further 64 bits */
   uint64_t pad_1          : 64;
 } VTAGenericInsn;

 /*! \brief VTA load/store instruction
  *   Load/store instruction can describe a 2D strided access pattern
  *   with padding, which can be useful to perform spatial padding
  *   on the fly on a tensor on which to perform 2D convolution.
  *   For instance if we try to load a 4x4 spatial tile from a 16x16
  *   matrix with padding of size 1 on all dimensions:
  *   y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
  *   x_pad_0 = 1, x_pad_1 = 1.
  *
  *   NOTE(review): the dependence-flag descriptions below read as written
  *   for the load case; confirm which flags apply to a store.
  */
 typedef struct {
   /*! \brief The instruction opcode */
   uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Unused in this instruction */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from GEMM stage */
   uint64_t pop_next_dep   : 1;
   /*! \brief Unused in this instruction */
   uint64_t push_prev_dep  : 1;
   /*! \brief Push dependence token to GEMM stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Source/destination SRAM for store/load instruction (a VTA_MEM_ID_* value) */
   uint64_t memory_type    : VTA_MEMOP_ID_BIT_WIDTH;
   /*! \brief SRAM base address (pointer to memory elem type) */
   uint64_t sram_base      : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
   /*! \brief DRAM base address (pointer to memory elem type) */
   uint64_t dram_base      : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
   /*! \brief 2D access pattern: y-size */
   uint64_t y_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
   /*! \brief 2D access pattern: x-size (in terms of memory elements) */
   uint64_t x_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
   /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
   uint64_t x_stride       : VTA_MEMOP_STRIDE_BIT_WIDTH;
   /*! \brief 2D access pattern: start padding along y dimension */
   uint64_t y_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: end padding along y dimension */
   uint64_t y_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: start padding along x dimension */
   uint64_t x_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: end padding along x dimension */
   uint64_t x_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
 } VTAMemInsn;

 /*! \brief VTA GEMM instruction
  *   GEMM instruction is implemented by executing a sequence of micro-operations
  *   that is read in the local micro-op memory, delimited by \a uop_bgn and
  *   \a uop_end. For improved storage-efficiency, the micro-operations can be
  *   executed in a 2-level nested loop as follows:
  *   \code{.cpp}
  *     for (i = 0; i < iter_out; i++) {
  *       for (j = 0; j < iter_in; j++) {
  *         for (k = uop_bgn; k < uop_end; k++) {
  *           // Read micro op
  *           uop_T uop = uop_mem[k];
  *           // Read in memory indices
  *           acc_idx_T acc_idx = uop.dst_idx;
  *           inp_idx_T inp_idx = uop.src_idx;
  *           wgt_idx_T wgt_idx = uop.wgt_idx;
  *           // Offset those indices by the current loop iterations
  *           acc_idx += j * dst_factor_in + i * dst_factor_out;
  *           inp_idx += j * src_factor_in + i * src_factor_out;
  *           wgt_idx += j * wgt_factor_in + i * wgt_factor_out;
  *           // Perform GEMM operation
  *           acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt_mem[wgt_idx]);
  *         }
  *       }
  *     }
  *   \endcode
  *
  */
 typedef struct {
   /*! \brief The instruction opcode */
   uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Pop dependence token from load stage */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from store stage */
   uint64_t pop_next_dep   : 1;
   /*! \brief Push dependence token to load stage */
   uint64_t push_prev_dep  : 1;
   /*! \brief Push dependence token to store stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Reset register */
   uint64_t reset_reg      : 1;
   /*! \brief Micro-op begin address (inclusive) */
   uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
   /*! \brief Micro-op end address (exclusive); one bit wider than uop_bgn */
   uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH + 1;
   /*! \brief Iterations in the outer uop execution loop */
   uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
   /*! \brief Iterations in the inner uop execution loop */
   uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
   /*! \brief Outer loop accumulator memory index factor */
   uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory index factor */
   uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Outer loop input memory index factor */
   uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
   /*! \brief Inner loop input memory index factor */
   uint64_t src_factor_in  : VTA_LOG_INP_BUFF_DEPTH;
   /*! \brief Outer loop weight memory index factor */
   uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
   /*! \brief Inner loop weight memory index factor */
   uint64_t wgt_factor_in  : VTA_LOG_WGT_BUFF_DEPTH;
 } VTAGemInsn;

 /*! \brief VTA ALU instruction
  *   ALU instruction is implemented by executing a sequence of micro-operations
  *   that is read in the local micro-op memory, delimited by \a uop_bgn and
  *   \a uop_end. For improved storage-efficiency, the micro-operations can be
  *   executed in a 2-level nested loop as follows:
  *   \code{.cpp}
  *     for (i = 0; i < iter_out; i++) {
  *       for (j = 0; j < iter_in; j++) {
  *         for (k = uop_bgn; k < uop_end; k++) {
  *           // Read micro op
  *           uop_T uop = uop_mem[k];
  *           // Read in memory indices
  *           acc_idx_T dst_idx = uop.dst_idx;
  *           inp_idx_T src_idx = uop.src_idx;
  *           // Offset those indices by the current loop iterations
  *           dst_idx += j * dst_factor_in + i * dst_factor_out;
  *           src_idx += j * src_factor_in + i * src_factor_out;
  *           // Perform ALU operation
  *           if (use_imm) {
  *             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
  *           } else {
  *             acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
  *           }
  *         }
  *       }
  *     }
  *   \endcode
  *
  *   NOTE(review): src_factor_out / src_factor_in are described as accumulator
  *   memory factors but are sized with VTA_LOG_INP_BUFF_DEPTH; confirm that
  *   this width is intended.
  */
 typedef struct {
   /*! \brief The instruction opcode */
   uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Pop dependence token from load stage */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from store stage */
   uint64_t pop_next_dep   : 1;
   /*! \brief Push dependence token to load stage */
   uint64_t push_prev_dep  : 1;
   /*! \brief Push dependence token to store stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Reset register */
   uint64_t reset_reg      : 1;
   /*! \brief Micro-op begin address (inclusive) */
   uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
   /*! \brief Micro-op end address (exclusive); one bit wider than uop_bgn */
   uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH + 1;
   /*! \brief Iterations in the outer uop execution loop */
   uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
   /*! \brief Iterations in the inner uop execution loop */
   uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
   /*! \brief Outer loop accumulator memory destination index factor */
   uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory destination index factor */
   uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Outer loop accumulator memory source index factor */
   uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory source index factor */
   uint64_t src_factor_in  : VTA_LOG_INP_BUFF_DEPTH;
   /*! \brief ALU opcode (see the VTA_ALU_OPCODE_* constants) */
   uint64_t alu_opcode     : VTA_ALU_OPCODE_BIT_WIDTH;
   /*! \brief Set when the second operand is the immediate rather than acc_mem[src_idx] */
   uint64_t use_imm        : 1;
   /*! \brief Immediate value; declared signed so that negative values are allowed */
   int64_t imm            : VTA_ALUOP_IMM_BIT_WIDTH;
 } VTAAluInsn;

 /*! \brief VTA instruction converter
  *   Union of all instruction layouts, so the same storage can be accessed
  *   as a generic, load/store, GEMM or ALU instruction.
  */
 union VTAInsn {
   /*! \brief VTA generic instruction */
   VTAGenericInsn generic;
   /*! \brief VTA load/store instruction */
   VTAMemInsn mem;
   /*! \brief VTA GEMM instruction */
   VTAGemInsn gemm;
   /*! \brief VTA ALU instruction */
   VTAAluInsn alu;
 };

 /*! \brief VTA micro-op for GEMM/ALU instruction
  *   The fields are declared in the same order and with the same widths as
  *   the VTA_UOP_GEM_* bit-position macros describe (acc, inp, wgt).
  */
 typedef struct {
   /*! \brief Destination index (indexes accum buffer) */
   uint32_t dst_idx    : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
   uint32_t src_idx    : VTA_LOG_INP_BUFF_DEPTH;
   /*! \brief Weight index (indexes weight buffer) */
   uint32_t wgt_idx    : VTA_LOG_WGT_BUFF_DEPTH;
 } VTAUop;

 #ifdef __cplusplus
 }
 #endif
 #endif  // VTA_HW_SPEC_H_
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	/*!
	* \file hw_spec.h
	* \brief Preprocessor definitions for VTA HLS design and runtime.
	*/

	#ifndef VTA_HW_SPEC_H_
	#define VTA_HW_SPEC_H_

	/*
	 * <stdint.h> is included before the linkage block: a C++ translation unit
	 * must include standard headers outside of any declaration, and an
	 * extern "C" { ... } block is a declaration.
	 */
	#include <stdint.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	 * Derived widths (in bits), sizes (in bytes) and buffer depths.
	 * The comment delimiters in this block are restored to valid C comments.
	 * VTA_LOG_* exponents that have no definition in this header are expected
	 * to be defined before these macros are expanded; presumably they are
	 * supplied by the build configuration -- TODO confirm.
	 */

	/*! Memory bus width in bits */
	#define VTA_BUS_WIDTH (1 << VTA_LOG_BUS_WIDTH)

	/*! log2 of instruction data type width */
	#define VTA_LOG_INS_WIDTH 7
	/*! Instruction data type width in bits (1 << 7 = 128) */
	#define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
	/*! log2 of micro-op data type width */
	#define VTA_LOG_UOP_WIDTH 5
	/*! Micro-op data type width in bits (1 << 5 = 32) */
	#define VTA_UOP_WIDTH (1 << VTA_LOG_UOP_WIDTH)
	/*! Weight data type width in bits */
	#define VTA_WGT_WIDTH (1 << VTA_LOG_WGT_WIDTH)
	/*! Input data type width in bits */
	#define VTA_INP_WIDTH (1 << VTA_LOG_INP_WIDTH)
	/*! Output data type width in bits */
	#define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
	/*! Accumulator data type width in bits */
	#define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)

	/*! Batch size (corresponds to A in an (A,B)x(B,C) matrix multiply) */
	#define VTA_BATCH (1 << VTA_LOG_BATCH)
	/*! Blocking factor of the innermost loop (corresponds to B in an (A,B)x(B,C) matrix multiply) */
	#define VTA_BLOCK_IN (1 << VTA_LOG_BLOCK_IN)
	/*! Blocking factor of the outer loop (corresponds to C in an (A,B)x(B,C) matrix multiply) */
	#define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)

	/*! On-chip micro-op buffer size in bytes */
	#define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
	/*! On-chip weight buffer size in bytes */
	#define VTA_WGT_BUFF_SIZE (1 << VTA_LOG_WGT_BUFF_SIZE)
	/*! On-chip activation (input) buffer size in bytes */
	#define VTA_INP_BUFF_SIZE (1 << VTA_LOG_INP_BUFF_SIZE)
	/*! On-chip accumulator buffer size in bytes */
	#define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)

	/*! Input matrix (VTA_BATCH x VTA_BLOCK_IN elements) size in bits */
	#define VTA_INP_MATRIX_WIDTH (VTA_INP_WIDTH * VTA_BATCH * VTA_BLOCK_IN)
	/*! Weight matrix (VTA_BLOCK_OUT x VTA_BLOCK_IN elements) size in bits */
	#define VTA_WGT_MATRIX_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_OUT * VTA_BLOCK_IN)
	/*! Accumulator matrix (VTA_BATCH x VTA_BLOCK_OUT elements) size in bits */
	#define VTA_ACC_MATRIX_WIDTH (VTA_ACC_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)
	/*! Output matrix (VTA_BATCH x VTA_BLOCK_OUT elements) size in bits */
	#define VTA_OUT_MATRIX_WIDTH (VTA_OUT_WIDTH * VTA_BATCH * VTA_BLOCK_OUT)

	/*! Ratio between input matrix size and AXI (memory bus) width */
	#define INP_MAT_AXI_RATIO (VTA_INP_MATRIX_WIDTH / VTA_BUS_WIDTH)
	/*! Ratio between weight matrix size and AXI (memory bus) width */
	#define WGT_MAT_AXI_RATIO (VTA_WGT_MATRIX_WIDTH / VTA_BUS_WIDTH)
	/*! Ratio between accumulator matrix size and AXI (memory bus) width */
	#define ACC_MAT_AXI_RATIO (VTA_ACC_MATRIX_WIDTH / VTA_BUS_WIDTH)
	/*! Ratio between output matrix size and AXI (memory bus) width */
	#define OUT_MAT_AXI_RATIO (VTA_OUT_MATRIX_WIDTH / VTA_BUS_WIDTH)

	/*! Size of instruction buffer element in bytes */
	#define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
	/*! Size of uop buffer element in bytes */
	#define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
	/*! Size of activation (input) buffer element in bytes */
	#define VTA_INP_ELEM_BYTES (VTA_INP_MATRIX_WIDTH / 8)
	/*! Size of weight buffer element in bytes */
	#define VTA_WGT_ELEM_BYTES (VTA_WGT_MATRIX_WIDTH / 8)
	/*! Size of accumulator buffer element in bytes */
	#define VTA_ACC_ELEM_BYTES (VTA_ACC_MATRIX_WIDTH / 8)
	/*! Size of output buffer element in bytes */
	#define VTA_OUT_ELEM_BYTES (VTA_OUT_MATRIX_WIDTH / 8)

	/*
	 * Buffer depth = buffer size in bytes / element size in bytes.
	 * In each log2 form the trailing "+ 3" is log2(8): it converts the element
	 * width from bits to bytes before it is taken off the log2 byte size.
	 */

	/*! On-chip micro-op buffer depth */
	#define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
	/*! log2 of on-chip micro-op buffer depth */
	#define VTA_LOG_UOP_BUFF_DEPTH (VTA_LOG_UOP_BUFF_SIZE - VTA_LOG_UOP_WIDTH + 3)
	/*! On-chip weight buffer depth */
	#define VTA_WGT_BUFF_DEPTH (VTA_WGT_BUFF_SIZE / VTA_WGT_ELEM_BYTES)
	/*! log2 of on-chip weight buffer depth */
	#define VTA_LOG_WGT_BUFF_DEPTH \
	    (VTA_LOG_WGT_BUFF_SIZE - VTA_LOG_BLOCK_OUT - VTA_LOG_BLOCK_IN - VTA_LOG_WGT_WIDTH + 3)
	/*! On-chip activation (input) buffer depth */
	#define VTA_INP_BUFF_DEPTH (VTA_INP_BUFF_SIZE / VTA_INP_ELEM_BYTES)
	/*! log2 of on-chip activation (input) buffer depth */
	#define VTA_LOG_INP_BUFF_DEPTH \
	    (VTA_LOG_INP_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_IN - VTA_LOG_INP_WIDTH + 3)
	/*! On-chip accumulator buffer depth */
	#define VTA_ACC_BUFF_DEPTH (VTA_ACC_BUFF_SIZE / VTA_ACC_ELEM_BYTES)
	/*! log2 of on-chip accumulator buffer depth */
	#define VTA_LOG_ACC_BUFF_DEPTH \
	    (VTA_LOG_ACC_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_OUT - VTA_LOG_ACC_WIDTH + 3)

	/*
	 * Instruction and micro-op encoding constants.
	 * The comment delimiters in this block are restored to valid C comments.
	 */

	/*! Instruction opcode field bitwidth (wide enough for opcodes 0..4 below) */
	#define VTA_OPCODE_BIT_WIDTH 3
	/*! ALU opcode field bitwidth (wide enough for ALU opcodes 0..3 below) */
	#define VTA_ALU_OPCODE_BIT_WIDTH 2

	/*! Opcode: load encoding */
	#define VTA_OPCODE_LOAD 0
	/*! Opcode: store encoding */
	#define VTA_OPCODE_STORE 1
	/*! Opcode: GEMM encoding */
	#define VTA_OPCODE_GEMM 2
	/*! Opcode: finish encoding */
	#define VTA_OPCODE_FINISH 3
	/*! Opcode: ALU encoding */
	#define VTA_OPCODE_ALU 4

	/*! ALU opcode: binary min op */
	#define VTA_ALU_OPCODE_MIN 0
	/*! ALU opcode: binary max op */
	#define VTA_ALU_OPCODE_MAX 1
	/*! ALU opcode: binary add op */
	#define VTA_ALU_OPCODE_ADD 2
	/*! ALU opcode: shift right by immediate op */
	#define VTA_ALU_OPCODE_SHR 3

	/*! Memory type field bitwidth */
	#define VTA_MEMOP_ID_BIT_WIDTH 2
	/*! Load/Store Instruction: SRAM address width */
	#define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
	/*! Load/Store Instruction: DRAM address width */
	#define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
	/*! Load/Store Instruction: transfer size width */
	#define VTA_MEMOP_SIZE_BIT_WIDTH 16
	/*! Load/Store Instruction: stride size width */
	#define VTA_MEMOP_STRIDE_BIT_WIDTH 16
	/*! Load/Store Instruction: padding width */
	#define VTA_MEMOP_PAD_BIT_WIDTH 4
	/*! Load/Store Instruction: padding value encoding width */
	#define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
	/*! GEMM/ALU Instruction: loop max iter bits */
	#define VTA_LOOP_ITER_WIDTH 14
	/*! ALU Instruction: immediate bitwidth */
	#define VTA_ALUOP_IMM_BIT_WIDTH 16
	/*! ALU Instruction: shift arg bitwidth */
	#define VTA_SHR_ARG_BIT_WIDTH (VTA_LOG_ACC_WIDTH)
	/*! ALU Instruction: multiply arg bitwidth */
	#define VTA_MUL_ARG_BIT_WIDTH 8

	/*! Mem ID constant: uop memory */
	#define VTA_MEM_ID_UOP 0
	/*! Mem ID constant: weight memory */
	#define VTA_MEM_ID_WGT 1
	/*! Mem ID constant: input memory */
	#define VTA_MEM_ID_INP 2
	/*! Mem ID constant: accumulator/bias memory */
	#define VTA_MEM_ID_ACC 3
	/*!
	 * Mem ID constant: output store buffer
	 * NOTE(review): the value 4 needs 3 bits, but VTA_MEMOP_ID_BIT_WIDTH is 2;
	 * confirm how this ID is meant to be encoded.
	 */
	#define VTA_MEM_ID_OUT 4

	/*! GEMM Micro-op start position of the acc_idx field */
	#define VTA_UOP_GEM_0_0 0
	/*! GEMM Micro-op end position of the acc_idx field */
	#define VTA_UOP_GEM_0_1 (VTA_UOP_GEM_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
	/*! GEMM Micro-op start position of the inp_idx field */
	#define VTA_UOP_GEM_1_0 (VTA_UOP_GEM_0_1 + 1)
	/*! GEMM Micro-op end position of the inp_idx field */
	#define VTA_UOP_GEM_1_1 (VTA_UOP_GEM_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)
	/*! GEMM Micro-op start position of the wgt_idx field */
	#define VTA_UOP_GEM_2_0 (VTA_UOP_GEM_1_1 + 1)
	/*! GEMM Micro-op end position of the wgt_idx field */
	#define VTA_UOP_GEM_2_1 (VTA_UOP_GEM_2_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)

	/*! ALU Micro-op start position of the dst_idx field */
	#define VTA_UOP_ALU_0_0 0
	/*! ALU Micro-op end position of the dst_idx field */
	#define VTA_UOP_ALU_0_1 (VTA_UOP_ALU_0_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
	/*! ALU Micro-op start position of the src_idx field */
	#define VTA_UOP_ALU_1_0 (VTA_UOP_ALU_0_1 + 1)
	/*! ALU Micro-op end position of the src_idx field */
	#define VTA_UOP_ALU_1_1 (VTA_UOP_ALU_1_0 + VTA_LOG_INP_BUFF_DEPTH - 1)

	/! \brief VTA generic instruction /
	typedef struct {
	/! \brief The instruction opcode /
	uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
	/! \brief Unused in this instruction /
	uint64_t pop_prev_dep : 1;
	/! \brief Pop dependence token from GEMM stage /
	uint64_t pop_next_dep : 1;
	/! \brief Unused in this instruction /
	uint64_t push_prev_dep : 1;
	/! \brief Push dependence token to GEMM stage /
	uint64_t push_next_dep : 1;
	/! \brief Padding /
	uint64_t pad_0 : 64 - VTA_OPCODE_BIT_WIDTH - 4;
	/! \brief Padding /
	uint64_t pad_1 : 64;
	} VTAGenericInsn;

	/*! \brief VTA load/store instruction
	* Load/store instruction can describe a 2D strided access pattern
	* with padding, which can be useful to perform spatial padding
	* on the fly on a tensor on which to perform 2D convolution.
	* For instance if we try to load a 4x4 spatial tile from a 16x16
	* matrix with padding of size 1 on all dimensions:
	* y_size = 4, x_size = 4, x_stride = 16, y_pad_0 = 1, y_pad_1 = 1,
	* x_pad_0 = 1, x_pad_1 = 1.
	*/
	typedef struct {
	/! \brief The instruction opcode /
	uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
	/! \brief Unused in this instruction /
	uint64_t pop_prev_dep : 1;
	/! \brief Pop dependence token from GEMM stage /
	uint64_t pop_next_dep : 1;
	/! \brief Unused in this instruction /
	uint64_t push_prev_dep : 1;
	/! \brief Push dependence token to GEMM stage /
	uint64_t push_next_dep : 1;
	/! \brief Source/destination SRAM for store/load instruction /
	uint64_t memory_type : VTA_MEMOP_ID_BIT_WIDTH;
	/! \brief SRAM base address (pointer to memory elem type) /
	uint64_t sram_base : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
	/! \brief DRAM base address (pointer to memory elem type) /
	uint64_t dram_base : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
	/! \brief 2D access pattern: y-size /
	uint64_t y_size : VTA_MEMOP_SIZE_BIT_WIDTH;
	/! \brief 2D access pattern: x-size (in terms of memory elements) /
	uint64_t x_size : VTA_MEMOP_SIZE_BIT_WIDTH;
	/! \brief 2D access pattern: x-stride (in terms of memory elements) /
	uint64_t x_stride : VTA_MEMOP_STRIDE_BIT_WIDTH;
	/! \brief 2D access pattern: start padding along y dimension /
	uint64_t y_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
	/! \brief 2D access pattern: end padding along y dimension /
	uint64_t y_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
	/! \brief 2D access pattern: start padding along x dimension /
	uint64_t x_pad_0 : VTA_MEMOP_PAD_BIT_WIDTH;
	/! \brief 2D access pattern: end padding along x dimension /
	uint64_t x_pad_1 : VTA_MEMOP_PAD_BIT_WIDTH;
	} VTAMemInsn;

	/*! \brief VTA GEMM instruction
	* GEMM instruction is implemented by executing a sequence of micro-operations
	* that is read in the local micro-op memory, delimited by \a uop_bgn and
	* \a uop_end. For improved storage-efficiency, the micro-operations can be
	* executed in a 2-level nested loop as follows:
	* \code{.cpp}
	* for (i = 0; i < iter_out; i++) {
	* for (j = 0; j < iter_in; j++) {
	* for (k = uop_bgn; k < uop_end; k++) {
	* // Read micro op
	* uop_T uop = uop_mem[k];
	* // Read in memory indices
	* acc_idx_T acc_idx = uop.dst_idx;
	* inp_idx_T inp_idx = uop.inp_idx;
	* wgt_idx_T wgt_idx = uop.wgt_idx;
	* // Update those indices with the following affine functions
	* acc_idx += iter_in * dst_factor_in + iter_out * dst_factor_out;
	* inp_idx += iter_in * src_factor_in + iter_out * src_factor_out;
	* wgt_idx += iter_in * wgt_factor_in + iter_out * wgt_factor_out;
	* // Perform GEMM operation
	* acc_mem[acc_idx] += dot(inp_mem[inp_idx], wgt[wgt_idx]);
	* }
	* }
	* }
	* \endcode
	*
	*/
	typedef struct {
	/! \brief The instruction opcode /
	uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
	/! \brief Pop dependence token from load stage /
	uint64_t pop_prev_dep : 1;
	/! \brief Pop dependence token from store stage /
	uint64_t pop_next_dep : 1;
	/! \brief Push dependence token to load stage /
	uint64_t push_prev_dep : 1;
	/! \brief Push dependence token to store stage /
	uint64_t push_next_dep : 1;
	/! \brief Reset register /
	uint64_t reset_reg : 1;
	/! \brief Micro-op begin address /
	uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
	/! \brief Micro-op end address /
	uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH + 1;
	/! \brief Iterations in the outer uop execution loop /
	uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
	/! \brief Iterations in the inner uop execution loop /
	uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
	/! \brief Outer loop accumulator memory index factor /
	uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
	/! \brief Inner loop accumulator memory index factor /
	uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
	/! \brief Outer loop input memory index factor /
	uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
	/! \brief Inner loop input memory index factor /
	uint64_t src_factor_in : VTA_LOG_INP_BUFF_DEPTH;
	/! \brief Outer loop weight memory index factor /
	uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
	/! \brief Inner loop weight memory index factor /
	uint64_t wgt_factor_in : VTA_LOG_WGT_BUFF_DEPTH;
	} VTAGemInsn;

	/*! \brief VTA ALU instruction
	* ALU instruction is implemented by executing a sequence of micro-operations
	* that is read in the local micro-op memory, delimited by \a uop_bgn and
	* \a uop_end. For improved storage-efficiency, the micro-operations can be
	* executed in a 2-level nested loop as follows:
	* \code{.cpp}
	* for (i = 0; i < iter_out; i++) {
	* for (j = 0; j < iter_in; j++) {
	* for (k = uop_bgn; k < uop_end; k++) {
	* // Read micro op
	* uop_T uop = uop_mem[k];
	* // Read in memory indices
	* acc_idx_T dst_idx = uop.dst_idx;
	* inp_idx_T src_idx = uop.inp_idx;
	* // Update those indices with the following affine functions
	* dst_idx += iter_in * dst_factor_in + iter_out * dst_factor_out;
	* src_idx += iter_in * src_factor_in + iter_out * src_factor_out;
	* // Perform ALU operation
	* if (use_imm) {
	* acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], imm);
	* } else {
	* acc_mem[dst_idx] = alu_op(alu_opcode, acc_mem[dst_idx], acc_mem[src_idx]);
	* }
	* }
	* }
	* }
	* \endcode
	*
	*/
	typedef struct {
	/! \brief The instruction opcode /
	uint64_t opcode : VTA_OPCODE_BIT_WIDTH;
	/! \brief Pop dependence token from load stage /
	uint64_t pop_prev_dep : 1;
	/! \brief Pop dependence token from store stage /
	uint64_t pop_next_dep : 1;
	/! \brief Push dependence token to load stage /
	uint64_t push_prev_dep : 1;
	/! \brief Push dependence token to store stage /
	uint64_t push_next_dep : 1;
	/! \brief Reset register /
	uint64_t reset_reg : 1;
	/! \brief Micro-op begin address /
	uint64_t uop_bgn : VTA_LOG_UOP_BUFF_DEPTH;
	/! \brief Micro-op end address /
	uint64_t uop_end : VTA_LOG_UOP_BUFF_DEPTH + 1;
	/! \brief Iterations in the outer uop execution loop /
	uint64_t iter_out : VTA_LOOP_ITER_WIDTH;
	/! \brief Iterations in the inner uop execution loop /
	uint64_t iter_in : VTA_LOOP_ITER_WIDTH;
	/! \brief Outer loop accumulator memory destination index factor /
	uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
	/! \brief Inner loop accumulator memory destination index factor /
	uint64_t dst_factor_in : VTA_LOG_ACC_BUFF_DEPTH;
	/! \brief Outer loop accumulator memory source index factor /
	uint64_t src_factor_out : VTA_LOG_INP_BUFF_DEPTH;
	/! \brief Inner loop accumulator memory source index factor /
	uint64_t src_factor_in : VTA_LOG_INP_BUFF_DEPTH;
	/! \brief ALU opcode /
	uint64_t alu_opcode : VTA_ALU_OPCODE_BIT_WIDTH;
	/! \brief Use immediate is true /
	uint64_t use_imm : 1;
	/! \brief Immediate value: allow negative value /
	int64_t imm : VTA_ALUOP_IMM_BIT_WIDTH;
	} VTAAluInsn;

	/! \brief VTA ALU instruction converter /
	union VTAInsn {
	/! \brief VTA generic instruction /
	VTAGenericInsn generic;
	/! \brief VTA load/store instruction /
	VTAMemInsn mem;
	/! \brief VTA GEMM instruction /
	VTAGemInsn gemm;
	/! \brief VTA ALU instruction /
	VTAAluInsn alu;
	};

	/! \brief VTA micro-op for GEMM/ALU instruction /
	typedef struct {
	/! \brief Destination index (indexes accum buffer) /
	uint32_t dst_idx : VTA_LOG_ACC_BUFF_DEPTH;
	/! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) /
	uint32_t src_idx : VTA_LOG_INP_BUFF_DEPTH;
	/! \brief Weight index (indexes weight buffer) /
	uint32_t wgt_idx : VTA_LOG_WGT_BUFF_DEPTH;
	} VTAUop;

	#ifdef __cplusplus
	}
	#endif
	#endif // VTA_HW_SPEC_H_