/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <type_traits>

#include "conv2d.h"

namespace tvm {
namespace runtime {
namespace hexagon {
namespace conv_utils {

/**
 * @brief Convert the layout of weights from flat to "chunked". The term "chunked" is
 * explained below.
 *
 * Weights are packed into the layout described below (the notation is similar to an
 * index map). Since the weight layout cannot be represented exactly in index map
 * notation, the base split is given first, followed by a few deviations:
 *
 *   lambda h, w, i, o: o//32, i//32, h, w, (i%32)//4, o%32, i%4
 *
 * The deviations are:
 * - w is stored in right-to-left order, i.e. 3,2,1,0 instead of 0,1,2,3
 *
 * Unlike the 16-bit variant below, the spatial dimensions are not split, so every
 * chunk spans the full height x width of the filter.
 *
 * @param out_ptr Pointer table to be filled with pointers to the first address of each
 * weight chunk
 * @param out_ptr_size The number of chunks (capacity of the pointer table)
 * @param out Pointer to pre-allocated output memory
 * @param inp Pointer to flat input data
 * @param height Height of the filter
 * @param width Width of the filter
 * @param idepth Number of input channels
 * @param odepth Number of output channels
 * @param wgt_zp Weight zero point, used to pad partial channel chunks
 */
void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height, int width,
int idepth, int odepth, int wgt_zp) {
auto inp_data = static_cast<int8_t*>(inp);
auto out_data = static_cast<int8_t*>(out);
const int stride_i = odepth;
const int stride_x = stride_i * idepth;
const int stride_y = stride_x * width;
  // Walk the weights in 32 (input) x 32 (output) channel chunks; partial
  // chunks are padded with the weight zero point below.
  for (int ci = 0; ci < idepth; ci += 32) {
for (int co = 0; co < odepth; co += 32) {
int max_i = std::min(32, idepth - ci);
int max_o = std::min(32, odepth - co);
auto chunk = out_data;
for (int y = 0; y < height; ++y) {
for (int x = width - 1; x >= 0; --x) {
for (int i = 0; i < max_i; ++i) {
for (int o = 0; o < max_o; ++o) {
chunk[hwio_to_sm_8b(width, y, x, i, o)] =
inp_data[y * stride_y + x * stride_x + (ci + i) * stride_i + (co + o)];
}
            // Pad the unused output channels of a partial chunk with the weight zero point.
            for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp;
          }
          // Pad the unused input channels of a partial chunk with the weight zero point.
          for (int i = max_i; i < 32; ++i)
            for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_8b(width, y, x, i, o)] = wgt_zp;
}
}
*out_ptr++ = chunk;
out_data += height * width * 32 * 32;
out_ptr_size--;
assert(out_ptr_size >= 0);
}
}
}
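
// [Illustration only] A minimal, self-contained sketch of the intra-chunk index map
// documented above, assuming the dimensions (h, w, (i%32)//4, o%32, i%4) are laid out
// contiguously with w reversed. flat_index_8b_sketch is a hypothetical helper written
// for exposition; the production mapping is hwio_to_sm_8b from conv2d.h, whose exact
// layout may differ.
inline int flat_index_8b_sketch(int width, int h, int w, int i, int o) {
  int w_rev = width - 1 - w;  // deviation: w is stored right to left
  return (((h * width + w_rev) * 8 + (i % 32) / 4) * 32 + (o % 32)) * 4 + (i % 4);
}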

/**
 * @brief Convert the layout of weights from flat to "chunked". The term "chunked" is
 * explained below.
 *
 * Weights are packed into the layout described below (the notation is similar to an
 * index map). Since the weight layout cannot be represented exactly in index map
 * notation, the base split is given first, followed by a few gotchas:
 *
 *   lambda h, w, i, o: h//8, w//4, o//32, i//32, h%8, w%4, (i%32)//2, o%32, i%2
 *
 * The gotchas are:
 * - (w%4) is stored in right-to-left order, i.e. 3,2,1,0 instead of 0,1,2,3
 * - The h%8 and (w%4) dimensions are not padded up, leading to chunks of different
 *   sizes (hence the name "chunked" instead of "packed")
 * - The thinnest chunk along the width is stored first. For example, if a kernel is
 *   5x5, the first chunk along the width has size 1 (representing index 0) and the
 *   next one has size 4 (representing indices 1,2,3,4)
 *
 * @param out_ptr Pointer table to be filled with pointers to the first address of each
 * weight chunk
 * @param out_ptr_size The number of chunks (capacity of the pointer table)
 * @param out Pointer to pre-allocated output memory
 * @param inp Pointer to flat input data
 * @param height Height of the filter
 * @param width Width of the filter
 * @param idepth Number of input channels
 * @param odepth Number of output channels
 */
void chunkify_hwio_16b(void** out_ptr, int out_ptr_size, void* out, void* inp, int height,
int width, int idepth, int odepth) {
auto inp_data = static_cast<uint16_t*>(inp);
auto out_data = static_cast<uint16_t*>(out);
const int stride_i = odepth;
const int stride_x = stride_i * idepth;
const int stride_y = stride_x * width;
  for (int cy = 0; cy < height; cy += 8) {
    // In the chunkified tensor, the chunks are ordered in increasing
    // x order, but they start from the thin one: cx begins at
    // width - round_up(width, 4), which is zero or negative (e.g. -3 for
    // width 5), and cx0 clamps it to the first valid column.
    for (int cx = width - round_up(width, 4); cx < width; cx += 4) {
      int cx0 = std::max(0, cx);
for (int ci = 0; ci < idepth; ci += 32) {
for (int co = 0; co < odepth; co += 32) {
int max_y = std::min(8, height - cy);
int max_x = std::min(4, cx + 4 - cx0);
int max_i = std::min(32, idepth - ci);
int max_o = std::min(32, odepth - co);
auto chunk = out_data;
for (int y = 0; y < max_y; ++y) {
for (int x = max_x - 1; x >= 0; --x) {
for (int i = 0; i < max_i; ++i) {
for (int o = 0; o < max_o; ++o) {
chunk[hwio_to_sm_16b(max_x, y, x, i, o)] =
inp_data[(cy + y) * stride_y + (cx0 + x) * stride_x + (ci + i) * stride_i +
(co + o)];
}
                // Pad the unused output channels of a partial chunk with zeroes.
                for (int o = max_o; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0;
              }
              // Pad the unused input channels of a partial chunk with zeroes.
              for (int i = max_i; i < 32; ++i)
                for (int o = 0; o < 32; ++o) chunk[hwio_to_sm_16b(max_x, y, x, i, o)] = 0;
}
}
*out_ptr++ = chunk;
out_data += max_y * max_x * 32 * 32;
out_ptr_size--;
assert(out_ptr_size >= 0);
}
}
}
}
}
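
// [Illustration only] The 16-bit analogue of the sketch above, assuming the intra-chunk
// dimensions (h%8, w%4, (i%32)//2, o%32, i%2) are contiguous with w reversed. max_x is
// the actual (possibly thin) chunk width, matching the max_x computed in
// chunkify_hwio_16b; flat_index_16b_sketch is a hypothetical stand-in for
// hwio_to_sm_16b from conv2d.h.
inline int flat_index_16b_sketch(int max_x, int y, int x, int i, int o) {
  int x_rev = max_x - 1 - x;  // gotcha: (w%4) is stored right to left
  return (((y * max_x + x_rev) * 16 + (i % 32) / 2) * 32 + (o % 32)) * 2 + (i % 2);
}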

/**
 * @brief Return the H, W, I, O dimensions of a flat filter, with the input-
 * and output-channel counts rounded up to multiples of 32.
 */
std::tuple<int, int, int, int> getHWIO(const DLTensor* hwio_flat) {
int h = hwio_flat->shape[0];
int w = hwio_flat->shape[1];
int i = round_up(hwio_flat->shape[2], 32);
int o = round_up(hwio_flat->shape[3], 32);
return std::make_tuple(h, w, i, o);
}
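
// [Illustration only] A worked example of the rounding above, assuming a hypothetical
// 3x3 filter with 5 input and 40 output channels: getHWIO on shape {3, 3, 5, 40}
// yields (3, 3, 32, 64), since 5 rounds up to 32 and 40 rounds up to 64, while height
// and width are left untouched.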

/**
 * @brief Allocate VTCM for the quantized 8-bit weights and chunkify them into
 * it, filling ptr_table with one pointer per chunk.
 */
SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
                             int num_chunks, void** ptr_table, int wgt_zp) {
tvm::ffi::String vtcm_scope = "global.vtcm";
auto [h, w, i, o] = getHWIO(hwio_flat);
int64_t shape_1d[] = {h * w * i * o};
void* hwio_vtcm =
device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope);
chunkify_hwio_8b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0],
hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3], wgt_zp);
return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm, {1, 1, i / 32, o / 32});
}

SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat,
int num_chunks, void** ptr_table) {
tvm::ffi::String vtcm_scope = "global.vtcm";
// Allocate one block for filter data. We will need to create our own
// pointer table. The reason is that filter chunks cannot be padded
// height- or width-wise, so filter chunks may have different sizes.
// A filter chunk is a block of size HxWx32x32, where H, W are at most
// height and width of a block respectively.
auto [h, w, i, o] = getHWIO(hwio_flat);
int64_t shape_1d[] = {h * w * i * o};
void* hwio_vtcm =
device_api->AllocDataSpace(hexagon_device, 1, shape_1d, hwio_flat->dtype, vtcm_scope);
chunkify_hwio_16b(ptr_table, num_chunks, hwio_vtcm, hwio_flat->data, hwio_flat->shape[0],
hwio_flat->shape[1], hwio_flat->shape[2], hwio_flat->shape[3]);
return SDLTensor<4>(ptr_table, hwio_flat->dtype, hwio_vtcm,
{round_up(h, 8) / 8, round_up(w, 4) / 4, i / 32, o / 32});
}
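
// [Illustration only] A worked example of the returned shape, assuming a hypothetical
// 5x5 fp16 filter with 35 input and 40 output channels:
//   round_up(5, 8)/8 = 1, round_up(5, 4)/4 = 2, i = 64/32 = 2, o = 64/32 = 2,
// i.e. a chunk grid of shape {1, 2, 2, 2}, for 8 chunks in total.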

int calculate_num_weight_chunks(int64_t* shape_hwio, int chunk_height, int chunk_width,
                                int chunk_in_channel, int chunk_out_channel) {
  // A slower round-up than round_up: it doesn't assume the multiple 'p' is a
  // power of 2.
  auto roundup = [](int v, int p) { return (v + p - 1) - ((v + p - 1) % p); };
int h = roundup(shape_hwio[0], chunk_height);
int w = roundup(shape_hwio[1], chunk_width);
int i = roundup(shape_hwio[2], chunk_in_channel);
int o = roundup(shape_hwio[3], chunk_out_channel);
return (h * w * i * o) / (chunk_height * chunk_width * chunk_in_channel * chunk_out_channel);
}
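
// [Illustration only] A minimal sketch of how these helpers compose for the fp16 path,
// assuming the caller owns device_api and a flat HWIO filter wgt_flat; sizing the
// pointer table with std::vector is an assumption made for exposition, not a fixed API
// contract. The 8x4x32x32 chunk shape matches chunkify_hwio_16b above.
//
//   int num_chunks = calculate_num_weight_chunks(
//       wgt_flat->shape, /*chunk_height=*/8, /*chunk_width=*/4,
//       /*chunk_in_channel=*/32, /*chunk_out_channel=*/32);
//   std::vector<void*> ptr_table(num_chunks);
//   SDLTensor<4> wgt = prepare_hwio(device_api, wgt_flat, num_chunks, ptr_table.data());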
} // namespace conv_utils
} // namespace hexagon
} // namespace runtime
} // namespace tvm