| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| #include <hexagon_types.h> |
| #include <hvx_hexagon_protos.h> |
| #include <inttypes.h> |
| #include <tvm/runtime/base.h> |
| #include <tvm/runtime/device_api.h> |
| |
| #include "conv2d.h" |
| |
| extern "C" int conv2d_packed_quant(void*, TVMFFIAny* args, int num_args, TVMFFIAny* out_val); |
| |
| namespace tvm { |
| namespace runtime { |
| namespace hexagon { |
| inline uint8_t* getElementPtr_int8(int block_out_y, int block_out_x, int block_out_c, int yi, |
| int xi, int ci, const DLTensor& block) { |
| auto block_ptr = |
| tvm::runtime::hexagon::conv_utils::nhwc_at(block, 0, block_out_y, block_out_x, block_out_c); |
| const int width_stride = 32; |
| const int height_stride = width_stride * 8; |
| auto block_offset = yi * height_stride + xi * width_stride + ci; |
| auto first_element_ptr = reinterpret_cast<uint8_t*>(block_ptr); |
| return first_element_ptr + block_offset; |
| } |
| |
| inline int8_t* getWgtPtr_int8(int out_i, int out_o, int h, int w, int i, int o, |
| const DLTensor& wgt_vtcm, int width) { |
| auto data = static_cast<intptr_t*>(wgt_vtcm.data); |
| auto chunk = data[out_i * wgt_vtcm.shape[3] + out_o]; |
| auto base_chunk_ptr = reinterpret_cast<int8_t*>(chunk); |
| auto wgt_chunk_offset = tvm::runtime::hexagon::conv_utils::hwio_to_sm_8b(width, h, w, i, o); |
| return base_chunk_ptr + wgt_chunk_offset; |
| } |
| |
/**
 * @brief Clamp a 32-bit value into the unsigned 8-bit range [0, 255].
 *
 * @param val Value to clamp
 * @return 0 if val is negative, 255 if val exceeds 255, otherwise val
 */
int32_t saturate_uint8(int32_t val) {
  if (val < 0) {
    return 0;
  }
  if (val > 255) {
    return 255;
  }
  return val;
}
| |
/**
 * @brief Clamp a 32-bit value into the signed 8-bit range [-128, 127].
 *
 * @param val Value to clamp
 * @return -128 if val is below -128, 127 if val exceeds 127, otherwise val
 */
int32_t saturate_int8(int32_t val) {
  if (val < -128) {
    return -128;
  }
  if (val > 127) {
    return 127;
  }
  return val;
}
| |
| /** |
| * @brief Compute the quantized convolution along with requantize with output quantization params to |
| * get uint8 outputs |
| * |
| * The quantized convolution is represented by the below equation |
| * out_scale(out_q - out_zp) = Σr,s,c(act_scale(act_q[n,h+r,w+s,c] - act_zp) * |
| * wgt_scale(wgt_q[r,s,c,o] - wgt_zp)) |
| * => out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_q[r,s,c,o] - wgt_zp)) |
| * * (act_scale*wgt_scale/out_scale) + out_zp |
| * out_q = Σr,s,c((act_q[n,h+r,w+s,c] - act_zp) * (wgt_zp_q[r,s,c,o])) * |
| * (act_scale*wgt_scale/out_scale) + out_zp, where wgt_zp_q = (wgt_q[r,s,c,o] - wgt_zp) |
| * |
| * Assumptions/Limitations: |
| * - Strided convolution is not yet supported so the stride variables are unused |
| * |
| * @param cr_out blockized output tensor with zeros already filled in |
| * @param cr_act blockized activations |
| * @param cr_filt Chunkified weights as returned from output of prepare_hwio |
| * @param out_shape Original output shape of the tensor before blockization |
| * @param act_shape Original input shape |
| * @param filt_shape Original filter shape |
| * @param act_scale Quantization scale for activation |
| * @param act_zp Activations zero point |
| * @param wgt_scale Quantization scale for weights |
| * @param wgt_zp Weights zero point |
| * @param out_scale Quantization scale for output |
| * @param out_zp Output zero point |
| * @param fixed_final_scale Fixed point value of final_scale= (act_scale*wgt_scale/out_scale) |
| * @param scale_factor Scale factor for the fixed_final_scale |
| */ |
| void conv_layer_int8_hvx_whole(DLTensor& cr_out, const DLTensor& cr_act, // NOLINT(*) |
| const DLTensor& cr_filt, const DLTensor& out_shape, |
| const DLTensor& act_shape, const DLTensor& filt_shape, |
| float act_scale, int act_zp, float wgt_scale, int wgt_zp, |
| float out_scale, int out_zp, int fixed_final_scale, |
| int scale_factor) { |
| namespace conv_utils = tvm::runtime::hexagon::conv_utils; |
| int filt_height = filt_shape.shape[0]; |
| int filt_width = filt_shape.shape[1]; |
| int filt_idepth = filt_shape.shape[2]; |
| |
| int a_depth = cr_act.shape[3]; |
| |
| int o_height = cr_out.shape[1]; |
| int o_width = cr_out.shape[2]; |
| int o_depth = cr_out.shape[3]; |
| |
| int out_height = out_shape.shape[1]; |
| int out_width = out_shape.shape[2]; |
| |
| uint8_t act_zp_u8 = static_cast<uint8_t>(act_zp); |
| int8_t wgt_zp_i8 = static_cast<int8_t>(wgt_zp); |
| |
| HVX_Vector act_zp_vec = Q6_Vb_vsplat_R(act_zp_u8); |
| HVX_Vector wgt_zp_vec = Q6_Vb_vsplat_R(wgt_zp_i8); |
| HVX_VectorPair wgt_zp_vec_pair = Q6_Wh_vsxt_Vb(wgt_zp_vec); |
| |
| TVM_FFI_ICHECK_EQ(a_depth, cr_filt.shape[2]) << "input depth should match weights input channels"; |
| TVM_FFI_ICHECK_EQ(o_depth, cr_filt.shape[3]) |
| << "output depth should match the weights output channel"; |
| |
| uint32_t scale_u = static_cast<uint32_t>(fixed_final_scale); |
| HVX_Vector scale_vec = Q6_V_vsplat_R(scale_u); |
| uint32_t new_scale_factor = static_cast<uint32_t>(scale_factor - 16); |
| HVX_Vector out_zp_vec = Q6_V_vsplat_R(out_zp); |
| |
| auto computeOutVec = [&cr_act, &cr_filt, &act_zp_vec, &wgt_zp_vec_pair, &out_zp_vec, &scale_vec, |
| new_scale_factor, filt_height, filt_width, |
| filt_idepth](int out_h, int out_w, int out_c, int h, int w) -> HVX_Vector { |
| HVX_Vector out_vec = Q6_V_vzero(); |
| for (int fh = 0; fh < filt_height; ++fh) { |
| for (int fw = 0; fw < filt_width; ++fw) { |
| for (int c = 0; c < conv_utils::round_up(filt_idepth, 4); c += 4) { |
| int act_h = out_h * 8 + h + fh; |
| int act_ho = act_h / 8; |
| int act_hi = act_h % 8; |
| |
| int act_w = out_w * 8 + w + fw; |
| int act_wo = act_w / 8; |
| int act_wi = act_w % 8; |
| |
| int act_co = c / 32; |
| int act_ci = c % 32; |
| |
| uint8_t* act_ptr = |
| getElementPtr_int8(act_ho, act_wo, act_co, act_hi, act_wi, act_ci, cr_act); |
| |
| uint32_t four_act_elems = *reinterpret_cast<uint32_t*>(act_ptr); |
| HVX_Vector act_vec = Q6_V_vsplat_R(four_act_elems); |
| int8_t* wgt_ptr = getWgtPtr_int8(act_co, out_c, fh, fw, act_ci, 0, cr_filt, filt_width); |
| |
| HVX_Vector* wgt_vec_ptr = reinterpret_cast<HVX_Vector*>(wgt_ptr); |
| HVX_Vector wgt_vec = *wgt_vec_ptr; |
| |
| HVX_VectorPair act_vec_zp_diff = Q6_Wh_vsub_VubVub(act_vec, act_zp_vec); |
| HVX_VectorPair wgt_i16_vec_nodiff = Q6_Wh_vsxt_Vb(wgt_vec); |
| HVX_VectorPair wgt_i16_vec = Q6_Wh_vsub_WhWh_sat(wgt_i16_vec_nodiff, wgt_zp_vec_pair); |
| |
| out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_lo_W(act_vec_zp_diff), |
| Q6_V_lo_W(wgt_i16_vec)); |
| out_vec = Q6_Vw_vdmpyacc_VwVhVh_sat(out_vec, Q6_V_hi_W(act_vec_zp_diff), |
| Q6_V_hi_W(wgt_i16_vec)); |
| } |
| } |
| } |
| HVX_Vector mul_vec = Q6_Vw_vmpye_VwVuh(out_vec, scale_vec); |
| HVX_Vector scaled_vec = Q6_Vw_vasr_VwR(mul_vec, new_scale_factor); |
| HVX_Vector sum_vec = Q6_Vw_vadd_VwVw(scaled_vec, out_zp_vec); |
| return sum_vec; |
| }; |
| |
| auto saturateAndStore = [&cr_out, &computeOutVec](int out_h, int out_w, int out_c, int h, int w) { |
| uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); |
| HVX_Vector* out_vec_ptr = reinterpret_cast<HVX_Vector*>(out_ptr); |
| HVX_Vector out_vec1, out_vec2, out_vec3, out_vec4, out_vec; |
| out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); |
| out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); |
| out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); |
| out_vec4 = computeOutVec(out_h, out_w, out_c, h, w + 3); |
| |
| HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); |
| HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(out_vec4, out_vec3); |
| out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); |
| *out_vec_ptr = out_vec; |
| }; |
| |
| for (int out_c = 0; out_c < o_depth; ++out_c) { |
| for (int out_h = 0; out_h < o_height; ++out_h) { |
| int max_y = std::min(8, out_height - out_h * 8); |
| for (int out_w = 0; out_w < o_width; ++out_w) { |
| int max_x = std::min(8, out_width - out_w * 8); |
| for (int h = 0; h < max_y; ++h) { |
| if (max_x == 8) { |
| for (int w = 0; w < max_x; w += 4) { |
| saturateAndStore(out_h, out_w, out_c, h, w); |
| } |
| } else { |
| int w = 0; |
| if (max_x >= 4) { |
| saturateAndStore(out_h, out_w, out_c, h, w); |
| w = 4; |
| } |
| uint8_t* out_ptr = getElementPtr_int8(out_h, out_w, out_c, h, w, 0, cr_out); |
| HVX_Vector* out_vec_ptr = reinterpret_cast<HVX_Vector*>(out_ptr); |
| HVX_Vector out_vec1, out_vec2, out_vec3, out_vec; |
| if (max_x % 4 == 1) { |
| out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); |
| HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec1); |
| out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); |
| *out_vec_ptr = out_vec; |
| } else if (max_x % 4 == 2) { |
| out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); |
| out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); |
| HVX_Vector half_vec = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); |
| out_vec = Q6_Vub_vpack_VhVh_sat(Q6_V_vzero(), half_vec); |
| *out_vec_ptr = out_vec; |
| } else if (max_x % 4 == 3) { |
| out_vec1 = computeOutVec(out_h, out_w, out_c, h, w); |
| out_vec2 = computeOutVec(out_h, out_w, out_c, h, w + 1); |
| out_vec3 = computeOutVec(out_h, out_w, out_c, h, w + 2); |
| HVX_Vector half_vec1 = Q6_Vh_vpack_VwVw_sat(out_vec2, out_vec1); |
| HVX_Vector half_vec2 = Q6_Vh_vpack_VwVw_sat(Q6_V_vzero(), out_vec3); |
| out_vec = Q6_Vub_vpack_VhVh_sat(half_vec2, half_vec1); |
| *out_vec_ptr = out_vec; |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| } // namespace hexagon |
| } // namespace runtime |
| } // namespace tvm |
| |
| int conv2d_packed_quant(void*, TVMFFIAny* args, int num_args, TVMFFIAny* out_val) { |
| namespace conv_utils = tvm::runtime::hexagon::conv_utils; |
| TVM_FFI_ICHECK_EQ(num_args, 13) << "Unexpected number of arguments"; |
| TVM_FFI_ICHECK_EQ(args[0].type_index, kTVMFFIDLTensorPtr) |
| << "First argument is expected to be the input tensor"; // Input activations |
| TVM_FFI_ICHECK_EQ(args[1].type_index, kTVMFFIDLTensorPtr) |
| << "Second argument is expected to be the weights tensor"; // Weights |
| TVM_FFI_ICHECK_EQ(args[2].type_index, kTVMFFIFloat) |
| << "Third argument is expected to be the activation scale"; |
| TVM_FFI_ICHECK_EQ(args[3].type_index, kTVMFFIInt) |
| << "Fourth argument is expected to be the activation zero point"; |
| TVM_FFI_ICHECK_EQ(args[4].type_index, kTVMFFIFloat) |
| << "Fifth argument is expected to be the weight scale"; |
| TVM_FFI_ICHECK_EQ(args[5].type_index, kTVMFFIInt) |
| << "Sixth argument is expected to be the weight zero point"; |
| TVM_FFI_ICHECK_EQ(args[6].type_index, kTVMFFIFloat) |
| << "Seventh argument is expected to be the output scale"; |
| TVM_FFI_ICHECK_EQ(args[7].type_index, kTVMFFIInt) |
| << "Eigth argument is expected to be the output zero point"; |
| TVM_FFI_ICHECK_EQ(args[8].type_index, kTVMFFIInt) |
| << "Nineth argument is expected to be the stride_h"; // stride_h |
| TVM_FFI_ICHECK_EQ(args[9].type_index, kTVMFFIInt) |
| << "Tenth argument is expected to be the stride_w"; // stride_w |
| TVM_FFI_ICHECK_EQ(args[10].type_index, kTVMFFIInt) |
| << "Eleventh argument is expected to be fixed final scale"; |
| TVM_FFI_ICHECK_EQ(args[11].type_index, kTVMFFIInt) |
| << "Twelfth argument is expected to be scale factor"; |
| TVM_FFI_ICHECK_EQ(args[12].type_index, kTVMFFIDLTensorPtr) |
| << "Thirteenth argument is expected to be the output tensor"; // output |
| |
| auto* act_flat = static_cast<DLTensor*>(args[0].v_ptr); |
| auto* wgt_flat = static_cast<DLTensor*>(args[1].v_ptr); |
| auto* out_flat = static_cast<DLTensor*>(args[12].v_ptr); |
| |
| // Temporary assertion until multiple batches are supported |
| TVM_FFI_ICHECK_EQ(act_flat->shape[0], 1) << "Input batch size more than 1 is not supported yet"; |
| |
| // Temporary assertion until multiple batches are supported |
| TVM_FFI_ICHECK_EQ(out_flat->shape[0], 1) << "Output batch size more than 1 is not supported yet"; |
| |
| float act_scale = args[2].v_float64; |
| int act_zp = args[3].v_int64; |
| LOG_INFO << "act_scale: " << act_scale << ", act_zp: " << act_zp; |
| |
| float wgt_scale = args[4].v_float64; |
| int wgt_zp = args[5].v_int64; |
| LOG_INFO << "wgt_scale: " << wgt_scale << ", wgt_zp: " << wgt_zp; |
| |
| float out_scale = args[6].v_float64; |
| int out_zp = args[7].v_int64; |
| LOG_INFO << "out_scale: " << out_scale << ", out_zp: " << out_zp; |
| |
| int stride_h = args[8].v_int64; |
| int stride_w = args[9].v_int64; |
| LOG_INFO << "stride_h: " << stride_h << ", stride_w: " << stride_w; |
| |
| int fixed_final_scale = args[10].v_int64; |
| int scale_factor = args[11].v_int64; |
| LOG_INFO << "fixed_final_scale: " << fixed_final_scale << ", scale_factor: " << scale_factor; |
| |
| auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); |
| TVM_FFI_ICHECK(device_api != nullptr); |
| tvm::ffi::String vtcm_scope = "global.vtcm"; |
| |
| auto act_vtcm = |
| conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, act_flat, /*copy_data=*/true); |
| |
| int num_wgt_chunks = conv_utils::calculate_num_weight_chunks( |
| wgt_flat->shape, /* chunk_height */ wgt_flat->shape[0], |
| /* chunk_width */ wgt_flat->shape[1], /* chunk_in_channel */ 32, /* chunk_out_channel */ 32); |
| auto wgt_ptr_table = |
| reinterpret_cast<void**>(__builtin_alloca(num_wgt_chunks * sizeof(uintptr_t))); |
| |
| auto wgt_vtcm = |
| conv_utils::prepare_hwio_8b(device_api, wgt_flat, num_wgt_chunks, wgt_ptr_table, wgt_zp); |
| |
| auto out_vtcm = |
| conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, out_flat, /*copy_data=*/false); |
| |
| auto act_shape = conv_utils::SDLTensor<4>(nullptr, act_flat->dtype, nullptr, act_flat->shape); |
| auto filt_shape = conv_utils::SDLTensor<4>(nullptr, wgt_flat->dtype, nullptr, wgt_flat->shape); |
| auto out_shape = conv_utils::SDLTensor<4>(nullptr, out_flat->dtype, nullptr, out_flat->shape); |
| |
| tvm::runtime::hexagon::conv_layer_int8_hvx_whole( |
| out_vtcm, act_vtcm, wgt_vtcm, out_shape, act_shape, filt_shape, act_scale, act_zp, wgt_scale, |
| wgt_zp, out_scale, out_zp, fixed_final_scale, scale_factor); |
| |
| conv_utils::deblockize_hwc<uint8_t, 8, 8, 32>(out_flat->data, out_vtcm.data, out_flat->shape[1], |
| out_flat->shape[2], out_flat->shape[3]); |
| |
| conv_utils::release(device_api, out_vtcm); |
| conv_utils::release(device_api, wgt_vtcm); |
| conv_utils::release(device_api, act_vtcm); |
| |
| return 0; |
| } |