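Optimize Linux shared library modules (*.so files) (#2445)

Annotate the VTA runtime API declarations in include/vta/runtime.h with
TVM_DLL, and include <tvm/runtime/c_runtime_api.h> in the header and in
src/runtime.cc so that the macro is in scope.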

diff --git a/include/vta/runtime.h b/include/vta/runtime.h
index e58d454..5af9156 100644
--- a/include/vta/runtime.h
+++ b/include/vta/runtime.h
@@ -11,6 +11,7 @@
 extern "C" {
 #endif
 
+#include <tvm/runtime/c_runtime_api.h>
 #include "driver.h"
 
 #define VTA_MEMCPY_H2D 1
@@ -28,13 +29,13 @@
  * \param size Buffer size.
  * \return A pointer to the allocated buffer.
  */
-void* VTABufferAlloc(size_t size);
+TVM_DLL void* VTABufferAlloc(size_t size);
 
 /*!
  * \brief Free data buffer.
  * \param buffer The data buffer to be freed.
  */
-void VTABufferFree(void* buffer);
+TVM_DLL void VTABufferFree(void* buffer);
 
 /*!
  * \brief Copy data buffer from one location to another.
@@ -45,24 +46,24 @@
  * \param size Size of copy.
  * \param kind_mask The memory copy kind.
  */
-void VTABufferCopy(const void* from,
-                   size_t from_offset,
-                   void* to,
-                   size_t to_offset,
-                   size_t size,
-                   int kind_mask);
+TVM_DLL void VTABufferCopy(const void* from,
+                           size_t from_offset,
+                           void* to,
+                           size_t to_offset,
+                           size_t size,
+                           int kind_mask);
 
 /*! \brief VTA command handle */
 typedef void* VTACommandHandle;
 
 /*! \brief Shutdown hook of VTA to cleanup resources */
-void VTARuntimeShutdown();
+TVM_DLL void VTARuntimeShutdown(void);
 
 /*!
  * \brief Get thread local command handle.
  * \return A thread local command handle.
  */
-VTACommandHandle VTATLSCommandHandle();
+TVM_DLL VTACommandHandle VTATLSCommandHandle(void);
 
 /*!
  * \brief Get the buffer access pointer on CPU.
@@ -70,7 +71,7 @@
  * \param buffer The data buffer.
  * \return The pointer that can be accessed by the CPU.
  */
-void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
+TVM_DLL void* VTABufferCPUPtr(VTACommandHandle cmd, void* buffer);
 
 /*!
  * \brief Perform a write barrier to make a memory region visible to the CPU.
@@ -80,11 +81,11 @@
  * \param start The start of the region (in elements).
  * \param extent The end of the region (in elements).
  */
-void VTAWriteBarrier(VTACommandHandle cmd,
-                     void* buffer,
-                     uint32_t elem_bits,
-                     uint32_t start,
-                     uint32_t extent);
+TVM_DLL void VTAWriteBarrier(VTACommandHandle cmd,
+                             void* buffer,
+                             uint32_t elem_bits,
+                             uint32_t start,
+                             uint32_t extent);
 /*!
  * \brief Perform a read barrier to a memory region visible to VTA.
  * \param cmd The VTA command handle.
@@ -93,18 +94,18 @@
  * \param start The start of the region (in elements).
  * \param extent The end of the region (in elements).
  */
-void VTAReadBarrier(VTACommandHandle cmd,
-                    void* buffer,
-                    uint32_t elem_bits,
-                    uint32_t start,
-                    uint32_t extent);
+TVM_DLL void VTAReadBarrier(VTACommandHandle cmd,
+                            void* buffer,
+                            uint32_t elem_bits,
+                            uint32_t start,
+                            uint32_t extent);
 
 /*!
  * \brief Set debug mode on the command handle.
  * \param cmd The VTA command handle.
  * \param debug_flag The debug flag.
  */
-void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
+TVM_DLL void VTASetDebugMode(VTACommandHandle cmd, int debug_flag);
 
 /*!
  * \brief Perform a 2D data load from DRAM.
@@ -122,18 +123,18 @@
  * \param dst_sram_index Destination SRAM index.
  * \param dst_memory_type Destination memory type.
  */
-void VTALoadBuffer2D(VTACommandHandle cmd,
-                     void* src_dram_addr,
-                     uint32_t src_elem_offset,
-                     uint32_t x_size,
-                     uint32_t y_size,
-                     uint32_t x_stride,
-                     uint32_t x_pad_before,
-                     uint32_t y_pad_before,
-                     uint32_t x_pad_after,
-                     uint32_t y_pad_after,
-                     uint32_t dst_sram_index,
-                     uint32_t dst_memory_type);
+TVM_DLL void VTALoadBuffer2D(VTACommandHandle cmd,
+                             void* src_dram_addr,
+                             uint32_t src_elem_offset,
+                             uint32_t x_size,
+                             uint32_t y_size,
+                             uint32_t x_stride,
+                             uint32_t x_pad_before,
+                             uint32_t y_pad_before,
+                             uint32_t x_pad_after,
+                             uint32_t y_pad_after,
+                             uint32_t dst_sram_index,
+                             uint32_t dst_memory_type);
 
 /*!
  * \brief Perform a 2D data store into DRAM
@@ -147,14 +148,14 @@
  * \param y_size The number of rows.
  * \param x_stride The x axis stride.
  */
-void VTAStoreBuffer2D(VTACommandHandle cmd,
-                      uint32_t src_sram_index,
-                      uint32_t src_memory_type,
-                      void* dst_dram_addr,
-                      uint32_t dst_elem_offset,
-                      uint32_t x_size,
-                      uint32_t y_size,
-                      uint32_t x_stride);
+TVM_DLL void VTAStoreBuffer2D(VTACommandHandle cmd,
+                              uint32_t src_sram_index,
+                              uint32_t src_memory_type,
+                              void* dst_dram_addr,
+                              uint32_t dst_elem_offset,
+                              uint32_t x_size,
+                              uint32_t y_size,
+                              uint32_t x_stride);
 
 /*!
  * \brief Push uop into kernel buffer.
@@ -187,14 +188,14 @@
  * \param use_imm Use immediate in ALU mode if set to true.
  * \param imm_val Immediate value in ALU mode.
  */
-void VTAUopPush(uint32_t mode,
-                uint32_t reset_out,
-                uint32_t dst_index,
-                uint32_t src_index,
-                uint32_t wgt_index,
-                uint32_t opcode,
-                uint32_t use_imm,
-                int32_t imm_val);
+TVM_DLL void VTAUopPush(uint32_t mode,
+                        uint32_t reset_out,
+                        uint32_t dst_index,
+                        uint32_t src_index,
+                        uint32_t wgt_index,
+                        uint32_t opcode,
+                        uint32_t use_imm,
+                        int32_t imm_val);
 
 /*!
  * \brief Mark start of a micro op loop.
@@ -203,15 +204,15 @@
  * \param src_factor The input factor.
  * \param wgt_factor The weight factor.
  */
-void VTAUopLoopBegin(uint32_t extent,
-                     uint32_t dst_factor,
-                     uint32_t src_factor,
-                     uint32_t wgt_factor);
+TVM_DLL void VTAUopLoopBegin(uint32_t extent,
+                             uint32_t dst_factor,
+                             uint32_t src_factor,
+                             uint32_t wgt_factor);
 
 /*!
  * \brief Mark end of a micro op loop.
  */
-void VTAUopLoopEnd();
+TVM_DLL void VTAUopLoopEnd(void);
 
 /*!
  * \brief Push GEMM uop kernel into the command handle.
@@ -221,10 +222,10 @@
  * \param nbytes Number of bytes to in the closure arguments.
  * \return 0 if success.
  */
-int VTAPushGEMMOp(void** uop_handle,
-                  int (*finit)(void*),
-                  void* signature,
-                  int nbytes);
+TVM_DLL int VTAPushGEMMOp(void** uop_handle,
+                          int (*finit)(void*),
+                          void* signature,
+                          int nbytes);
 
 /*!
  * \brief Push ALU uop kernel into the command handle.
@@ -234,10 +235,10 @@
  * \param nbytes Number of bytes to in the closure arguments.
  * \return 0 if success.
  */
-int VTAPushALUOp(void** uop_handle,
-                 int (*finit)(void*),
-                 void* signature,
-                 int nbytes);
+TVM_DLL int VTAPushALUOp(void** uop_handle,
+                         int (*finit)(void*),
+                         void* signature,
+                         int nbytes);
 
 /*!
  * \brief Push dependence token.
@@ -246,7 +247,7 @@
  * \param to_qid The destination queue.
  * \return 0 if success.
  */
-int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
+TVM_DLL int VTADepPush(VTACommandHandle cmd, int from_qid, int to_qid);
 
 /*!
  * \brief Pop dependence signal.
@@ -255,7 +256,7 @@
  * \param to_qid The destination queue.
  * \return 0 if success.
  */
-int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
+TVM_DLL int VTADepPop(VTACommandHandle cmd, int from_qid, int to_qid);
 
 /*!
  * \brief Synchronize the command handle.
@@ -266,7 +267,7 @@
  * \param wait_cycles The limit of poll cycles.
  *
  */
-void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
+TVM_DLL void VTASynchronize(VTACommandHandle cmd, uint32_t wait_cycles);
 
 #ifdef __cplusplus
 }
diff --git a/src/runtime.cc b/src/runtime.cc
index ffa0096..88d4007 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -10,6 +10,7 @@
 #include <vta/hw_spec.h>
 #include <vta/runtime.h>
 #include <dmlc/logging.h>
+#include <tvm/runtime/c_runtime_api.h>
 
 #include <cassert>
 #include <cstring>