blob: 73809d639758a00de1356784c982ed413dcc1ef2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma OPENCL EXTENSION cl_intel_channels: enable
#include "vta.h"
__attribute__((reqd_work_group_size(1,1,1)))
__kernel void vta_core(
unsigned int insn_count, unsigned int insns_offset, __global insn_T* restrict insns,
__global uop_T* restrict uops, __global acc_T* restrict biases, __global inp_T* restrict inputs,
__global wgt_T* restrict weights, __global out_T* restrict outputs) {
/* Local Memories */
uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
inp_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH][VTA_BLOCK_IN];
wgt_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT][VTA_BLOCK_IN];
acc_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH][VTA_BLOCK_OUT] __attribute__((memory, numbanks(1)));
out_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH][VTA_BLOCK_OUT];
for (int pc = 0; pc < insn_count; pc++) {
insn_T insn = insns[insns_offset + pc];
/* General Instruction Fields */
unsigned char insn_opcode = BITS(insn.w[0], OPCODE_OFFSET, OPCODE_WIDTH );
unsigned char insn_dep_flags = BITS(insn.w[0], DEP_FLAGS_OFFSET, DEP_FLAGS_WIDTH );
/* LOAD/STORE Instruction Fields */
unsigned char insn_memory_type = BITS(insn.w[0], MEMORY_TYPE_OFFSET, MEMORY_TYPE_WIDTH );
unsigned int insn_sram_base = BITS(insn.w[0], SRAM_BASE_OFFSET, SRAM_BASE_WIDTH );
unsigned int insn_dram_base = BITS(insn.w[0], DRAM_BASE_OFFSET, DRAM_BASE_WIDTH );
unsigned int insn_y_size = BITS(insn.w[1], Y_SIZE_OFFSET, Y_SIZE_WIDTH );
unsigned int insn_x_size = BITS(insn.w[1], X_SIZE_OFFSET, X_SIZE_WIDTH );
unsigned int insn_x_stride = BITS(insn.w[1], X_STRIDE_OFFSET, X_STRIDE_WIDTH );
unsigned int insn_y_pad_0 = BITS(insn.w[1], Y_PAD_0_OFFSET, Y_PAD_0_WIDTH );
unsigned int insn_y_pad_1 = BITS(insn.w[1], Y_PAD_1_OFFSET, Y_PAD_1_WIDTH );
unsigned int insn_x_pad_0 = BITS(insn.w[1], X_PAD_0_OFFSET, X_PAD_0_WIDTH );
unsigned int insn_x_pad_1 = BITS(insn.w[1], X_PAD_1_OFFSET, X_PAD_1_WIDTH );
/* GEMM/ALU Instruction Fields */
unsigned int insn_reset = BITS(insn.w[0], RESET_OFFSET, RESET_WIDTH );
unsigned int insn_uop_bgn = BITS(insn.w[0], UOP_BGN_OFFSET, UOP_BGN_WIDTH );
unsigned int insn_uop_end = BITS(insn.w[0], UOP_END_OFFSET, UOP_END_WIDTH );
unsigned int insn_iter_out = BITS(insn.w[0], ITER_OUT_OFFSET, ITER_OUT_WIDTH );
unsigned int insn_iter_in = BITS(insn.w[0], ITER_IN_OFFSET, ITER_IN_WIDTH );
unsigned int insn_dst_fac_out = BITS(insn.w[1], DST_FAC_OUT_OFFSET, DST_FAC_OUT_WIDTH );
unsigned int insn_dst_fac_in = BITS(insn.w[1], DST_FAC_IN_OFFSET, DST_FAC_IN_WIDTH );
unsigned int insn_gsrc_fac_out = BITS(insn.w[1], GSRC_FAC_OUT_OFFSET,GSRC_FAC_OUT_WIDTH );
unsigned int insn_gsrc_fac_in = BITS(insn.w[1], GSRC_FAC_IN_OFFSET, GSRC_FAC_IN_WIDTH );
unsigned int insn_asrc_fac_out = BITS(insn.w[1], ASRC_FAC_OUT_OFFSET,ASRC_FAC_OUT_WIDTH );
unsigned int insn_asrc_fac_in = BITS(insn.w[1], ASRC_FAC_IN_OFFSET, ASRC_FAC_IN_WIDTH );
unsigned int insn_wgt_fac_out = BITS(insn.w[1], WGT_FAC_OUT_OFFSET, WGT_FAC_OUT_WIDTH );
unsigned int insn_wgt_fac_in = BITS(insn.w[1], WGT_FAC_IN_OFFSET, WGT_FAC_IN_WIDTH );
unsigned char insn_alu_opcode = BITS(insn.w[1], ALU_OPCODE_OFFSET, ALU_OPCODE_WIDTH );
unsigned char insn_use_imm = BITS(insn.w[1], USE_IMM_OFFSET, USE_IMM_WIDTH );
short insn_imm = BITS(insn.w[1], IMM_OFFSET, IMM_WIDTH );
if (insn_opcode == VTA_OPCODE_FINISH) {
break;
} else if (insn_opcode == VTA_OPCODE_LOAD) {
if (insn_memory_type == VTA_MEM_ID_INP) {
unsigned int x_width = (insn_x_pad_0 + insn_x_size + insn_x_pad_1);
unsigned int y_width = (insn_y_pad_0 + insn_y_size + insn_y_pad_1);
for (unsigned y = 0; y < y_width; y++) {
unsigned int sram_offset_1 = insn_sram_base + y * x_width;
unsigned int dram_offset_1 = insn_dram_base + (y - insn_y_pad_0) * insn_x_stride;
for (unsigned x = 0; x < x_width; x++) {
unsigned int sram_offset_2 = sram_offset_1 + x;
unsigned int dram_offset_2 = dram_offset_1 + (x - insn_x_pad_0);
unsigned int dram_idx = dram_offset_2 * VTA_BATCH * VTA_BLOCK_IN;
for (unsigned i = 0; i < VTA_BATCH; i++) {
for (unsigned j = 0; j < VTA_BLOCK_IN; j++) {
if (x < insn_x_pad_0 || x >= (insn_x_pad_0 + insn_x_size) || y < insn_y_pad_0 ||
y >= (insn_y_pad_0 + insn_y_size))
inp_mem[sram_offset_2][i][j] = 0;
else {
inp_mem[sram_offset_2][i][j] = inputs[dram_idx + i * VTA_BLOCK_IN + j];
}
}
}
}
}
} else if (insn_memory_type == VTA_MEM_ID_WGT) {
for (unsigned y = 0; y < insn_y_size; y++) {
unsigned int sram_offset_1 = insn_sram_base + y * insn_x_size;
unsigned int dram_offset_1 = insn_dram_base + y * insn_x_stride;
for (unsigned x = 0; x < insn_x_size; x++) {
unsigned int sram_offset_2 = sram_offset_1 + x;
unsigned int dram_offset_2 = dram_offset_1 + x;
unsigned int dram_idx = dram_offset_2 * VTA_BLOCK_OUT * VTA_BLOCK_IN;
for (unsigned i = 0; i < VTA_BLOCK_OUT; i++) {
for (unsigned j = 0; j < VTA_BLOCK_IN; j++) {
wgt_mem[sram_offset_2][i][j] = weights[dram_idx + i * VTA_BLOCK_IN + j];
}
}
}
}
} else if (insn_memory_type == VTA_MEM_ID_ACC) {
for (unsigned y = 0; y < insn_y_size; y++) {
unsigned int sram_offset_1 = insn_sram_base + y * insn_x_size;
unsigned int dram_offset_1 = insn_dram_base + y * insn_x_stride;
for (unsigned x = 0; x < insn_x_size; x++) {
unsigned int sram_offset_2 = sram_offset_1 + x;
unsigned int dram_offset_2 = dram_offset_1 + x;
unsigned int dram_idx = dram_offset_2 * VTA_BATCH * VTA_BLOCK_OUT;
for (unsigned i = 0; i < VTA_BATCH; i++) {
for (unsigned j = 0; j < VTA_BLOCK_OUT; j++) {
acc_mem[sram_offset_2][i][j] = biases[dram_idx + i * VTA_BLOCK_OUT + j];
}
}
}
}
} else if (insn_memory_type == VTA_MEM_ID_UOP) {
for (unsigned x = 0; x < insn_x_size; x++) {
uop_mem[insn_sram_base + x] = uops[insn_dram_base + x];
}
} else if (insn_memory_type == VTA_MEM_ID_ACC_8BIT) {
for (unsigned y = 0; y < insn_y_size; y++) {
unsigned int sram_offset_1 = insn_sram_base + y * insn_x_size;
unsigned int dram_offset_1 = insn_dram_base + y * insn_x_stride;
for (unsigned x = 0; x < insn_x_size; x++) {
unsigned int sram_offset_2 = sram_offset_1 + x;
unsigned int dram_offset_2 = dram_offset_1 + x;
unsigned int dram_idx = dram_offset_2 * VTA_BATCH * VTA_BLOCK_OUT;
for (unsigned i = 0; i < VTA_BATCH; i++) {
for (unsigned j = 0; j < VTA_BLOCK_OUT; j++) {
acc_mem[sram_offset_2][i][j] = inputs[dram_idx + i * VTA_BLOCK_OUT + j];
}
}
}
}
}
} else if (insn_opcode == VTA_OPCODE_STORE) {
for (unsigned y = 0; y < insn_y_size; y++) {
unsigned int sram_offset_1 = insn_sram_base + y * insn_x_size;
unsigned int dram_offset_1 = insn_dram_base + y * insn_x_stride;
for (unsigned x = 0; x < insn_x_size; x++) {
unsigned int sram_offset_2 = sram_offset_1 + x;
unsigned int dram_offset_2 = dram_offset_1 + x;
unsigned int dram_idx = dram_offset_2 * VTA_BATCH * VTA_BLOCK_OUT;
for (unsigned i = 0; i < VTA_BATCH; i++) {
for (unsigned j = 0; j < VTA_BLOCK_OUT; j++) {
outputs[dram_idx + i * VTA_BLOCK_OUT + j] = out_mem[sram_offset_2][i][j];
}
}
}
}
} else if (insn_opcode == VTA_OPCODE_GEMM) {
/* Loop offset */
unsigned int dst_offset_out = 0;
unsigned int src_offset_out = 0;
unsigned int wgt_offset_out = 0;
/* Outer Loop */
for (unsigned int it_out = 0; it_out < insn_iter_out; it_out++) {
unsigned int dst_offset_in = dst_offset_out;
unsigned int src_offset_in = src_offset_out;
unsigned int wgt_offset_in = wgt_offset_out;
/* Inner Loop */
for (unsigned int it_in = 0; it_in < insn_iter_in; it_in++) {
for (unsigned int upc = insn_uop_bgn; upc < insn_uop_end; upc++) {
uop_T uop = uop_mem[upc];
unsigned int uop_dst_idx = BITS(uop, UOP_DST_OFFSET, UOP_DST_WIDTH);
unsigned int uop_src_idx = BITS(uop, UOP_SRC_OFFSET, UOP_SRC_WIDTH);
unsigned int uop_wgt_idx = BITS(uop, UOP_WGT_OFFSET, UOP_WGT_WIDTH);
/* Decode indices */
unsigned int dst_idx = uop_dst_idx + dst_offset_in;
unsigned int src_idx = uop_src_idx + src_offset_in;
unsigned int wgt_idx = uop_wgt_idx + wgt_offset_in;
/* Inner GEMM loop */
for (int b = 0; b < VTA_BATCH; b++) {
#pragma unroll
for (int oc = 0; oc < VTA_BLOCK_OUT; oc++) {
/* Initialize the accumulator values */
acc_T accum = acc_mem[dst_idx][b][oc];
sum_T sum = 0;
/* Inner matrix multiplication loop (input channel/feature) */
#pragma unroll
for (int ic = 0; ic < VTA_BLOCK_IN; ic++) {
wgt_T w_elem = wgt_mem[wgt_idx][oc][ic];
inp_T i_elem = inp_mem[src_idx][b][ic];
mul_T prod_dsp = i_elem * w_elem;
sum += (sum_T)prod_dsp;
}
/* WORKAROUND FOR A SIGNEDNESS BUG IN INTEL FPGA SDK FOR OPENCL */
#if VTA_BLOCK_IN == 16
if ( sum >= 0x80000 ) sum -= 0x100000;
#elif VTA_BLOCK_IN == 32
if ( sum >= 0x100000 ) sum -= 0x200000;
#else
#error Untested Condition
#endif
/* END WORKAROUND */
/* Update sum */
accum += sum;
acc_mem[dst_idx][b][oc] = (acc_T)(insn_reset ? 0 : accum);
out_mem[dst_idx][b][oc] = (out_T)(accum & 0xFF);
}
}
}
dst_offset_in += insn_dst_fac_in;
src_offset_in += insn_gsrc_fac_in;
wgt_offset_in += insn_wgt_fac_in;
}
dst_offset_out += insn_dst_fac_out;
src_offset_out += insn_gsrc_fac_out;
wgt_offset_out += insn_wgt_fac_out;
}
} else if (insn_opcode == VTA_OPCODE_ALU) {
/* Loop offset */
unsigned int dst_offset_out = 0;
unsigned int src_offset_out = 0;
/* Outer Loop */
for (unsigned int it_out = 0; it_out < insn_iter_out; it_out++) {
unsigned int dst_offset_in = dst_offset_out;
unsigned int src_offset_in = src_offset_out;
/* Inner Loop */
for (unsigned int it_in = 0; it_in < insn_iter_in; it_in++) {
for (unsigned int upc = insn_uop_bgn; upc < insn_uop_end; upc++) {
uop_T uop = uop_mem[upc];
unsigned int uop_dst_idx = BITS(uop, UOP_DST_OFFSET, UOP_DST_WIDTH);
unsigned int uop_src_idx = BITS(uop, UOP_SRC_OFFSET, UOP_SRC_WIDTH);
/* Decode indices */
unsigned int dst_idx = uop_dst_idx + dst_offset_in;
unsigned int src_idx = uop_src_idx + src_offset_in;
#pragma unroll
for (int i = 0; i < VTA_BATCH; i++) {
#pragma unroll
for (int b = 0; b < VTA_BLOCK_OUT; b++) {
/* Read in operands */
acc_T src_0 = acc_mem[dst_idx][i][b];
acc_T src_1 = (acc_T)(insn_use_imm ? insn_imm : acc_mem[src_idx][i][b]);
if (insn_alu_opcode == VTA_ALU_OPCODE_MIN ||
insn_alu_opcode == VTA_ALU_OPCODE_MAX) {
/* Compute Min/Max */
acc_T mix_val = src_0 < src_1
? (insn_alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1)
: (insn_alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
acc_mem[dst_idx][i][b] = mix_val;
out_mem[dst_idx][i][b] = (out_T)(mix_val & 0xFF);
} else if (insn_alu_opcode == VTA_ALU_OPCODE_ADD) {
/* Compute Sum */
acc_T add_val = src_0 + src_1;
acc_mem[dst_idx][i][b] = add_val;
out_mem[dst_idx][i][b] = (out_T)(add_val & 0xFF);
} else if (insn_alu_opcode == VTA_ALU_OPCODE_SHR) {
/* Compute Shift */
acc_T shr_val;
if (src_1 >= 0)
shr_val = src_0 >> src_1;
else
shr_val = src_0 << (-src_1);
acc_mem[dst_idx][i][b] = shr_val;
out_mem[dst_idx][i][b] = (out_T)(shr_val & 0xFF);
} else if (insn_alu_opcode == VTA_ALU_OPCODE_MUL) {
/* Compute Multiplication */
acc_T mul_val = src_0 * src_1;
acc_mem[dst_idx][i][b] = mul_val;
out_mem[dst_idx][i][b] = (out_T)(mul_val & 0xFF);
}
}
}
}
dst_offset_in += insn_dst_fac_in;
src_offset_in += insn_asrc_fac_in;
}
dst_offset_out += insn_dst_fac_out;
src_offset_out += insn_asrc_fac_out;
}
}
}
}